In [4]:
#!/usr/bin/env python3
"""
Comprehensive SAE-based representation shift analysis with layer sweeping,
real datasets, and patching logic for LLM->VLM adaptation studies.
"""
# Installation requirements:
"""
pip install sae-lens transformers torch matplotlib seaborn numpy datasets tqdm

# For CUDA support (recommended):
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
"""

import torch
import numpy as np
import os
import gc
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM
from sae_lens import SAE
import matplotlib.pyplot as plt
from typing import Dict, Tuple, List, Optional, Union
from dataclasses import dataclass
import seaborn as sns
from datasets import load_dataset
import json
from tqdm import tqdm
import warnings
import random
import json
warnings.filterwarnings("ignore")



In [1]:
"""
Comprehensive SAE-based representation shift analysis with layer sweeping,
real datasets, and patching logic for LLM->VLM adaptation studies.
FIXED: Handles PaliGemma loss computation correctly.
"""

import torch
import numpy as np
import os
import gc
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM
from sae_lens import SAE
import matplotlib.pyplot as plt
from typing import Dict, Tuple, List, Optional, Union
from dataclasses import dataclass
import seaborn as sns
from datasets import load_dataset
import json
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Disable gradients globally for memory efficiency
torch.set_grad_enabled(False)

@dataclass
class SAEMetrics:
    """Container for SAE evaluation metrics."""
    reconstruction_loss: float
    l0_sparsity: float
    l1_sparsity: float
    fraction_alive: float
    mean_max_activation: float
    reconstruction_score: float
    model_delta_loss: float 
    rec_loss_topk: float

@dataclass
class RepresentationShift:
    """Container for representation shift metrics."""
    cosine_similarity: float
    l2_distance: float
    feature_overlap: float
    js_divergence: float
    feature_correlation: float

class DatasetLoader:
    """Handles loading and preprocessing of various datasets."""
    
    def __init__(self, device: str = "cuda"):
        self.device = device
    
    def load_cifar100_captions(self, split: str = "train", max_samples: int = 100) -> List[str]:
        """Load CIFAR-100 with generated captions for multimodal analysis."""
        try:
            # CIFAR-100 doesn't have captions by default, so we create descriptive ones
            dataset = load_dataset("cifar100", split=split)
            
            # CIFAR-100 class names
            class_names = [
                'apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle',
                'bicycle', 'bottle', 'bowl', 'boy', 'bridge', 'bus', 'butterfly', 'camel',
                'can', 'castle', 'caterpillar', 'cattle', 'chair', 'chimpanzee', 'clock',
                'cloud', 'cockroach', 'couch', 'crab', 'crocodile', 'cup', 'dinosaur',
                'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster',
                'house', 'kangaroo', 'keyboard', 'lamp', 'lawn_mower', 'leopard', 'lion',
                'lizard', 'lobster', 'man', 'maple_tree', 'motorcycle', 'mountain', 'mouse',
                'mushroom', 'oak_tree', 'orange', 'orchid', 'otter', 'palm_tree', 'pear',
                'pickup_truck', 'pine_tree', 'plain', 'plate', 'poppy', 'porcupine',
                'possum', 'rabbit', 'raccoon', 'ray', 'road', 'rocket', 'rose',
                'sea', 'seal', 'shark', 'shrew', 'skunk', 'skyscraper', 'snail', 'snake',
                'spider', 'squirrel', 'streetcar', 'sunflower', 'sweet_pepper', 'table',
                'tank', 'telephone', 'television', 'tiger', 'tractor', 'train', 'trout',
                'tulip', 'turtle', 'wardrobe', 'whale', 'willow_tree', 'wolf', 'woman', 'worm'
            ]
            
            texts = []
            for i, sample in enumerate(dataset):
                if i >= max_samples:
                    break
                class_name = class_names[sample['fine_label']]
                # Generate descriptive captions
                captions = [
                    f"This is a photo of a {class_name}.",
                    f"An image showing a {class_name}.",
                    f"A picture of a {class_name} in natural setting.",
                    f"Visual representation of a {class_name}."
                ]
                texts.extend(captions[:2])  # Take 2 captions per image
            
            print(f"✅ Loaded {len(texts)} CIFAR-100 captions")
            return texts[:max_samples]
            
        except Exception as e:
            print(f"❌ Error loading CIFAR-100: {e}")
            return self._get_fallback_texts()
    
    def load_coco_captions(self, split: str = "validation", max_samples: int = 100) -> List[str]:
        """Load COCO captions dataset."""
        try:
            # Load COCO captions
            dataset = load_dataset("HuggingFaceM4/COCO", split=split)
            
            texts = []
            for i, sample in enumerate(dataset):
                if i >= max_samples:
                    break
                
                # COCO has multiple captions per image
                if 'sentences' in sample and 'raw' in sample['sentences']:
                    for sentence in sample['sentences']['raw'][:2]:  # Take first 2 captions
                        texts.append(sentence)
                elif 'caption' in sample:
                    texts.append(sample['caption'])
            
            print(f"✅ Loaded {len(texts)} COCO captions")
            return texts[:max_samples]
            
        except Exception as e:
            print(f"❌ Error loading COCO: {e}")
            # Try alternative COCO dataset
            try:
                dataset = load_dataset("nielsr/coco-captions", split="validation")
                texts = [sample['caption'] for sample in dataset.select(range(min(max_samples, len(dataset))))]
                print(f"✅ Loaded {len(texts)} COCO captions (alternative)")
                return texts
            except:
                return self._get_fallback_texts()
    
    def load_llava_bench(self, max_samples: int = 100) -> List[str]:
        """Load LLaVA-Bench questions/descriptions."""
        try:
            # LLaVA bench conversations
            dataset = load_dataset("lmms-lab/LLaVA-OneVision-Data", split="dev_mini")
            
            texts = []
            for i, sample in enumerate(dataset):
                if i >= max_samples:
                    break
                
                if 'conversations' in sample:
                    for conv in sample['conversations'][:2]:  # Take first 2 conversations
                        if 'value' in conv:
                            texts.append(conv['value'])
            
            print(f"✅ Loaded {len(texts)} LLaVA-Bench texts")
            return texts[:max_samples]
            
        except Exception as e:
            print(f"❌ Error loading LLaVA-Bench: {e}")
            return self._get_fallback_texts()
    
    def _get_fallback_texts(self) -> List[str]:
        """Fallback texts if datasets fail to load."""
        return [
            "A photo of a red apple on a white background.",
            "The cat is sitting on a wooden chair.",
            "Mountains covered with snow in winter landscape.",
            "A blue car driving on a highway.",
            "Children playing in a park with green grass.",
            "A delicious chocolate cake on a plate.",
            "Ocean waves crashing against rocky shore.",
            "A person reading a book in a library.",
            "Colorful flowers blooming in spring garden.",
            "A dog running happily in the field.",
        ]
    
    def get_mixed_dataset(self, total_samples: int = 150) -> List[str]:
        """Get a mixed dataset from multiple sources."""
        samples_per_source = total_samples // 3
        
        texts = []
        texts.extend(self.load_cifar100_captions(max_samples=samples_per_source))
        texts.extend(self.load_coco_captions(max_samples=samples_per_source))
        texts.extend(self.load_llava_bench(max_samples=samples_per_source))
        
        # Shuffle for good measure
        import random
        random.shuffle(texts)
        
        return texts[:total_samples]

class MemoryEfficientSAEAnalyzer:
    """Memory-efficient SAE analyzer with layer sweeping and patching logic."""
    
    def __init__(self, 
                 model_size: str = "2b",
                 width: str = "16k", 
                 suffix: str = "canonical",
                 device: str = "cuda",
                 output_dir: str = "../figs_tabs"):
        """
        Initialize memory-efficient SAE analyzer.
        
        Args:
            model_size: Model size ("2b" or "9b")
            width: SAE width ("16k", "65k", "262k")
            suffix: SAE variant ("canonical" or specific L0)
            device: Device to use
            output_dir: Directory for saving outputs
        """
        self.device = device if torch.cuda.is_available() else "cpu"
        self.model_size = model_size
        self.width = width
        self.suffix = suffix
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        
        # Model cache for memory efficiency
        self.model_cache = {}
        self.sae_cache = {}
        
        print(f"🔧 Initialized SAE Analyzer")
        print(f"   Device: {self.device}")
        print(f"   Model Size: {model_size}")
        print(f"   SAE Width: {width}")
        print(f"   Output Dir: {output_dir}")

    def get_gemmascope_sae(self, layer: int) -> SAE:
        """Load Gemma Scope SAE with caching for memory efficiency."""
        cache_key = f"layer_{layer}"
        
        if cache_key in self.sae_cache:
            return self.sae_cache[cache_key]
        
        release = f"gemma-scope-{self.model_size}-pt-res"
        if self.suffix == "canonical":
            release = f"gemma-scope-{self.model_size}-pt-res-canonical"
            sae_id = f"layer_{layer}/width_{self.width}/canonical"
        else:
            sae_id = f"layer_{layer}/width_{self.width}/{self.suffix}"
        
        print(f"   📥 Loading SAE Layer {layer}: {sae_id}")
        
        try:
            sae = SAE.from_pretrained(release, sae_id).to(self.device)
            sae.eval()
            
            # Cache management - keep only last 2 SAEs to save memory
            if len(self.sae_cache) >= 2:
                oldest_key = list(self.sae_cache.keys())[0]
                del self.sae_cache[oldest_key]
                gc.collect()
            
            self.sae_cache[cache_key] = sae
            return sae
            
        except Exception as e:
            print(f"❌ Error loading SAE layer {layer}: {e}")
            raise

    def get_model(self, model_name: str):
        """Load model with caching and proper device placement."""
        if model_name in self.model_cache:
            return self.model_cache[model_name]
        
        print(f"📥 Loading model: {model_name}")
        
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            
            # Handle different model types
            if "paligemma" in model_name.lower():
                from transformers import PaliGemmaForConditionalGeneration
                model = PaliGemmaForConditionalGeneration.from_pretrained(
                    model_name, 
                    trust_remote_code=True,
                    torch_dtype=torch.float32,  # Use fp16 for memory efficiency
                    device_map=None  # We'll handle device placement manually
                )
                model = model.to(self.device)
                language_model = model.language_model
            else:
                model = AutoModelForCausalLM.from_pretrained(
                    model_name, 
                    trust_remote_code=True,
                    torch_dtype=torch.float32,
                    device_map=None
                )
                model = model.to(self.device)
                language_model = model
            
            language_model.eval()
            
            # Cache management - keep only one model at a time
            if len(self.model_cache) >= 1:
                for cached_name in list(self.model_cache.keys()):
                    del self.model_cache[cached_name]
                gc.collect()
                torch.cuda.empty_cache() if torch.cuda.is_available() else None
            
            self.model_cache[model_name] = (tokenizer, model, language_model)
            return tokenizer, model, language_model
            
        except Exception as e:
            print(f"❌ Error loading model {model_name}: {e}")
            raise

    def extract_activations_with_patching(self, 
                                        model_name: str, 
                                        text: str, 
                                        layer: int,
                                        sae: Optional[SAE] = None) -> Tuple[torch.Tensor, float]:
        """
        Extract activations and compute model delta loss with patching.
        FIXED: Addresses CUDA device-side assert errors with proper tokenization and loss computation.

        Returns:
            Tuple of (activations, model_delta_loss)
        """
        tokenizer, model, language_model = self.get_model(model_name)

        # FIXED: More robust tokenization with proper padding token handling
        # Ensure we have a pad token
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id

        # Tokenize with safer parameters
        inputs = tokenizer(
            text, 
            return_tensors="pt", 
            padding="max_length",
            truncation=True,
            max_length=64,
            add_special_tokens=True  # Ensure special tokens are added properly
        )

        # FIXED: Validate token IDs are within vocabulary range
        vocab_size = tokenizer.vocab_size
        input_ids = inputs['input_ids']

        # Check for out-of-bounds token IDs
        if torch.any(input_ids >= vocab_size) or torch.any(input_ids < 0):
            print(f"⚠️  Invalid token IDs detected. Max ID: {input_ids.max()}, Vocab size: {vocab_size}")
            # Clamp invalid IDs to valid range
            input_ids = torch.clamp(input_ids, 0, vocab_size - 1)
            inputs['input_ids'] = input_ids

        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # FIXED: More robust label creation
        def create_labels(input_ids, pad_token_id):
            """Create labels with proper masking for loss computation"""
            labels = input_ids.clone()
            # Mask padding tokens
            labels[labels == pad_token_id] = -100
            # FIXED: Also mask the first token (often BOS) to avoid issues
            if labels.size(1) > 1:
                labels[:, 0] = -100
            return labels

        # Get unpatched model loss (baseline)
        unpatched_loss = 0.0
        with torch.no_grad():
            try:
                if "paligemma" in model_name.lower():
                    # For PaliGemma, we need to handle text-only input differently
                    labels = create_labels(inputs['input_ids'], tokenizer.pad_token_id)

                    # Get outputs from language model
                    unpatched_outputs = language_model(**inputs)

                    # Check if we have logits to compute loss
                    if hasattr(unpatched_outputs, 'logits'):
                        logits = unpatched_outputs.logits

                        # FIXED: More robust loss computation with better shape handling
                        if logits.size(1) > 1 and labels.size(1) > 1:
                            shift_logits = logits[..., :-1, :].contiguous()
                            shift_labels = labels[..., 1:].contiguous()

                            # Ensure we have valid data for loss computation
                            valid_mask = shift_labels != -100
                            if valid_mask.any():
                                shift_logits = shift_logits.view(-1, shift_logits.size(-1))
                                shift_labels = shift_labels.view(-1)

                                # FIXED: Use reduction='mean' and handle empty tensors
                                loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='mean')
                                unpatched_loss = loss_fct(shift_logits, shift_labels).item()
                            else:
                                print("⚠️  No valid tokens for loss computation")
                                unpatched_loss = 0.0
                        else:
                            print("⚠️  Insufficient sequence length for loss computation")
                            unpatched_loss = 0.0
                    else:
                        # Fallback for models without logits
                        unpatched_loss = 0.0
                        print(f"⚠️  No logits available for {model_name}, using zero loss")

                else:
                    # For regular language models
                    labels = create_labels(inputs['input_ids'], tokenizer.pad_token_id)
                    unpatched_outputs = language_model(**inputs, labels=labels)

                    if hasattr(unpatched_outputs, 'loss') and unpatched_outputs.loss is not None:
                        unpatched_loss = unpatched_outputs.loss.item()
                    else:
                        # FIXED: Same robust loss computation as above
                        if hasattr(unpatched_outputs, 'logits'):
                            logits = unpatched_outputs.logits

                            if logits.size(1) > 1 and labels.size(1) > 1:
                                shift_logits = logits[..., :-1, :].contiguous()
                                shift_labels = labels[..., 1:].contiguous()

                                valid_mask = shift_labels != -100
                                if valid_mask.any():
                                    shift_logits = shift_logits.view(-1, shift_logits.size(-1))
                                    shift_labels = shift_labels.view(-1)

                                    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='mean')
                                    unpatched_loss = loss_fct(shift_logits, shift_labels).item()
                                else:
                                    unpatched_loss = 0.0
                            else:
                                unpatched_loss = 0.0
                        else:
                            unpatched_loss = 0.0

            except Exception as e:
                print(f"⚠️  Error computing unpatched loss: {e}")
                unpatched_loss = 0.0

        # Extract activations from target layer
        activations = None
        patched_loss = unpatched_loss  # Default if no patching

        def activation_hook(module, input, output):
            nonlocal activations
            try:
                if isinstance(output, tuple):
                    activations = output[0].clone().detach()
                else:
                    activations = output.clone().detach()
            except Exception as e:
                print(f"⚠️  Error in activation hook: {e}")

        # FIXED: More robust layer identification
        target_layer = None
        try:
            if hasattr(language_model, 'model') and hasattr(language_model.model, 'layers'):
                if layer < len(language_model.model.layers):
                    target_layer = language_model.model.layers[layer]
                else:
                    print(f"❌ Layer {layer} out of range. Model has {len(language_model.model.layers)} layers")
                    return torch.randn(1, 64, 2304).to(self.device), 0.0
            elif hasattr(language_model, 'layers'):
                if layer < len(language_model.layers):
                    target_layer = language_model.layers[layer]
                else:
                    print(f"❌ Layer {layer} out of range. Model has {len(language_model.layers)} layers")
                    return torch.randn(1, 64, 2304).to(self.device), 0.0
            else:
                print(f"❌ Could not find layers in model structure")
                return torch.randn(1, 64, 2304).to(self.device), 0.0
        except Exception as e:
            print(f"❌ Error accessing layer {layer}: {e}")
            return torch.randn(1, 64, 2304).to(self.device), 0.0

        if target_layer is None:
            print(f"❌ Could not find layer {layer}")
            return torch.randn(1, 64, 2304).to(self.device), 0.0

        hook = target_layer.register_forward_hook(activation_hook)

        # Forward pass to get activations
        with torch.no_grad():
            try:
                if "paligemma" in model_name.lower():
                    _ = language_model(**inputs)
                else:
                    _ = language_model(**inputs)
            except Exception as e:
                print(f"⚠️  Error in activation extraction: {e}")

        hook.remove()

        # Compute patched loss if SAE is provided
        if sae is not None and activations is not None:
            patched_loss = self._compute_patched_loss(
                language_model, inputs, activations, sae, layer, model_name, tokenizer
            )

        model_delta_loss = patched_loss - unpatched_loss

        if activations is None:
            print(f"⚠️  Failed to extract activations from layer {layer}")
            # FIXED: Return appropriate tensor size based on model
            try:
                # Try to get the actual hidden size from the model config
                if hasattr(language_model, 'config') and hasattr(language_model.config, 'hidden_size'):
                    hidden_size = language_model.config.hidden_size
                else:
                    hidden_size = 2304  # fallback
                activations = torch.randn(1, 64, hidden_size).to(self.device)
            except:
                activations = torch.randn(1, 64, 2304).to(self.device)

        return activations, model_delta_loss

    def _compute_patched_loss(self, 
                            language_model, 
                            inputs: Dict, 
                            original_activations: torch.Tensor, 
                            sae: SAE, 
                            layer: int,
                            model_name: str,
                            tokenizer) -> float:
        """Compute loss with SAE-patched activations. FIXED: Robust error handling and loss computation."""
        try:
            # Get SAE reconstruction
            flat_activations = original_activations.view(-1, original_activations.size(-1))
            sae_output = sae(flat_activations)

            # Handle different SAE output formats
            if hasattr(sae_output, 'sae_out'):
                reconstructed = sae_output.sae_out
            elif isinstance(sae_output, tuple):
                reconstructed = sae_output[0]
            else:
                reconstructed = sae_output

            # Reshape back to original shape
            reconstructed = reconstructed.view(original_activations.shape)

            # Patch the reconstructed activations back into the model
            patched_activations = reconstructed.detach()  # FIXED: Ensure no gradients

            # Create a patching hook
            def patching_hook(module, input, output):
                try:
                    if isinstance(output, tuple):
                        return (patched_activations, *output[1:])
                    else:
                        return patched_activations
                except Exception as e:
                    print(f"⚠️  Error in patching hook: {e}")
                    return output  # Return original if patching fails

            # Hook the target layer for patching
            target_layer = None
            if hasattr(language_model, 'model') and hasattr(language_model.model, 'layers'):
                if layer < len(language_model.model.layers):
                    target_layer = language_model.model.layers[layer]
            elif hasattr(language_model, 'layers'):
                if layer < len(language_model.layers):
                    target_layer = language_model.layers[layer]

            if target_layer is None:
                return 0.0

            patch_hook = target_layer.register_forward_hook(patching_hook)

            # FIXED: Use the same robust label creation as in main function
            def create_labels(input_ids, pad_token_id):
                labels = input_ids.clone()
                labels[labels == pad_token_id] = -100
                if labels.size(1) > 1:
                    labels[:, 0] = -100
                return labels

            # Forward pass with patched activations
            patched_loss = 0.0
            with torch.no_grad():
                if "paligemma" in model_name.lower():
                    labels = create_labels(inputs['input_ids'], tokenizer.pad_token_id)
                    patched_outputs = language_model(**inputs)

                    if hasattr(patched_outputs, 'logits'):
                        logits = patched_outputs.logits

                        if logits.size(1) > 1 and labels.size(1) > 1:
                            shift_logits = logits[..., :-1, :].contiguous()
                            shift_labels = labels[..., 1:].contiguous()

                            valid_mask = shift_labels != -100
                            if valid_mask.any():
                                shift_logits = shift_logits.view(-1, shift_logits.size(-1))
                                shift_labels = shift_labels.view(-1)

                                loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='mean')
                                patched_loss = loss_fct(shift_logits, shift_labels).item()
                    else:
                        patched_loss = 0.0
                else:
                    labels = create_labels(inputs['input_ids'], tokenizer.pad_token_id)
                    patched_outputs = language_model(**inputs, labels=labels)

                    if hasattr(patched_outputs, 'loss') and patched_outputs.loss is not None:
                        patched_loss = patched_outputs.loss.item()
                    else:
                        if hasattr(patched_outputs, 'logits'):
                            logits = patched_outputs.logits

                            if logits.size(1) > 1 and labels.size(1) > 1:
                                shift_logits = logits[..., :-1, :].contiguous()
                                shift_labels = labels[..., 1:].contiguous()

                                valid_mask = shift_labels != -100
                                if valid_mask.any():
                                    shift_logits = shift_logits.view(-1, shift_logits.size(-1))
                                    shift_labels = shift_labels.view(-1)

                                    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='mean')
                                    patched_loss = loss_fct(shift_logits, shift_labels).item()

            patch_hook.remove()
            return patched_loss

        except Exception as e:
            print(f"⚠️  Patching failed: {e}")
            return 0.0

    def compute_sae_metrics(self, activations: torch.Tensor, sae: SAE, model_delta_loss: float) -> SAEMetrics:
        """Compute comprehensive SAE evaluation metrics including model delta loss and top-20 features."""
        with torch.no_grad():
            # Reshape activations for SAE processing
            batch_size, seq_len, d_model = activations.shape
            flat_activations = activations.view(-1, d_model)
            
            # Forward pass through SAE
            sae_output = sae(flat_activations) 
            
            # Handle different SAE output formats
            if hasattr(sae_output, 'feature_acts'):
                feature_acts = sae_output.feature_acts # shape (batch_size * seq_len,  latent_dim)
                reconstructed = sae_output.sae_out
            elif isinstance(sae_output, tuple) and len(sae_output) >= 2:
                reconstructed, feature_acts = sae_output[0], sae_output[1]
            elif hasattr(sae, 'encode') and hasattr(sae, 'decode'):
                feature_acts = sae.encode(flat_activations)
                reconstructed = sae.decode(feature_acts)
            else:
                reconstructed = sae_output
                if hasattr(sae, 'W_enc') and hasattr(sae, 'b_enc'):
                    feature_acts = torch.relu(flat_activations @ sae.W_enc + sae.b_enc)
                else:
                    print(f"Failed retrieving SAE reconstructions, random intialisign...")
                    feature_acts = torch.randn(flat_activations.shape[0], 16384, device=flat_activations.device)
            
            # 1. Reconstruction Loss (MSE)
            reconstruction_loss = torch.nn.functional.mse_loss(reconstructed, flat_activations).item()
            
            # 2. L0 Sparsity (fraction of non-zero features)
            l0_sparsity = (feature_acts > 0).float().mean().item()
            
            # 3. L1 Sparsity (mean absolute activation)
            l1_sparsity = feature_acts.abs().mean().item()
            
            # 4. Fraction of features that are ever active
            fraction_alive = (feature_acts.max(dim=0)[0] > 0).float().mean().item()
            
            # 5. Mean maximum activation per sample
            mean_max_activation = feature_acts.max(dim=1)[0].mean().item()
            
            # 6. Reconstruction score (explained variance)
            var_original = flat_activations.var(dim=0).mean()
            var_residual = (flat_activations - reconstructed).var(dim=0).mean()
            reconstruction_score = max(0.0, 1 - (var_residual / var_original).item())
            
            # Store top-20 features for analysis
            mean_feature_acts = feature_acts.mean(dim=0)  # Average across all tokens/samples
            top_20_indices = torch.topk(mean_feature_acts, k=min(20, feature_acts.size(-1)))[1]
            self._store_top_features(top_20_indices, mean_feature_acts, 
                                   reconstruction_loss, l0_sparsity, model_delta_loss)
            
            # top-20 rec loss
            top_acts = feature_acts[..., top_20_indices] # shape (batch_size * seq_len,  latent_dim)
            if hasattr(sae, 'decode'):
                recon_from_topk = sae.decode( top_acts )  # if your SAE supports that
            else:
                if hasattr(sae, 'W_dec') and hasattr(sae, 'b_dec'):
                    recon_from_topk = torch.relu(top_acts @ sae.W_dec + sae.b_dec)
            
            rec_loss_topk = F.mse_loss(recon_from_topk, flat_activations).item()
            
            return SAEMetrics(
                reconstruction_loss=reconstruction_loss,
                l0_sparsity=l0_sparsity,
                l1_sparsity=l1_sparsity,
                fraction_alive=fraction_alive,
                mean_max_activation=mean_max_activation,
                reconstruction_score=reconstruction_score,
                model_delta_loss=model_delta_loss,
                rec_loss_topk=rec_loss_topk
            )
    
    def _store_top_features(self, top_indices: torch.Tensor, feature_acts: torch.Tensor, 
                           recon_loss: float, sparsity: float, delta_loss: float):
        """Store top-20 activated features for analysis."""
        if not hasattr(self, 'top_features_log'):
            self.top_features_log = []
        
        top_features_info = {
            'top_20_indices': top_indices.cpu().tolist(),
            'top_20_activations': feature_acts[top_indices].cpu().tolist(),
            'reconstruction_loss': recon_loss,
            'sparsity': sparsity,
            'delta_loss': delta_loss,
            'timestamp': len(self.top_features_log)  # Simple counter
        }
        
        self.top_features_log.append(top_features_info)

    def analyze_layer_sweep(self, 
                           model1_name: str, 
                           model2_name: str, 
                           texts: List[str],
                           layers: List[int] = None) -> Dict:
        """
        Perform memory-efficient layer sweep analysis.
        
        Args:
            model1_name: First model (base LLM)
            model2_name: Second model (VLM) 
            texts: List of texts to analyze
            layers: List of layers to analyze (default: [8, 12, 16, 20])
        """
        if layers is None:
            layers = [8, 12, 16, 20]  # Sample layers across the model
        
        print(f"🚀 Starting Layer Sweep Analysis")
        print(f"   Model 1: {model1_name}")
        print(f"   Model 2: {model2_name}")
        print(f"   Layers: {layers}")
        print(f"   Texts: {len(texts)} samples")
        print(f"   Memory: {torch.cuda.memory_allocated() / 1e9:.2f}GB" if torch.cuda.is_available() else "")
        
        results = {
            'layers': layers,
            'layer_results': {},
            'texts': texts[:10]  # Store subset for reference
        }
        
        for layer in tqdm(layers, desc="Processing layers"):
            print(f"\n📊 Processing Layer {layer}")
            
            # Load SAE for this layer
            sae = self.get_gemmascope_sae(layer)
            
            layer_metrics = {
                'model1_metrics': [],
                'model2_metrics': [],
                'shift_metrics': []
            }
            
            # Process subset of texts for each layer (memory efficiency)
            sample_texts = texts[:100]  # Process 100 texts per layer (increased from 20)
            
            for i, text in enumerate(tqdm(sample_texts, desc=f"Layer {layer} texts", leave=False)):
                try:
                    # Extract activations and compute metrics for model 1
                    acts1, delta_loss1 = self.extract_activations_with_patching(
                        model1_name, text, layer, sae
                    )
                    metrics1 = self.compute_sae_metrics(acts1, sae, delta_loss1)
                    
                    # Extract activations and compute metrics for model 2
                    acts2, delta_loss2 = self.extract_activations_with_patching(
                        model2_name, text, layer, sae
                    )
                    metrics2 = self.compute_sae_metrics(acts2, sae, delta_loss2)
                    
                    # Compute representation shift
                    shift = self.compute_representation_shift(acts1, acts2, sae)
                    
                    layer_metrics['model1_metrics'].append(metrics1)
                    layer_metrics['model2_metrics'].append(metrics2)
                    layer_metrics['shift_metrics'].append(shift)
                    
                except Exception as e:
                    print(f"⚠️  Error processing text {i} in layer {layer}: {e}")
                    continue
            
            # Compute layer-level aggregates
            layer_metrics['aggregate'] = self._compute_layer_aggregate(layer_metrics)
            results['layer_results'][layer] = layer_metrics
            
            # Memory cleanup
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                print(f"   Memory after layer {layer}: {torch.cuda.memory_allocated() / 1e9:.2f}GB")
        
        # Compute overall analysis
        results['overall_analysis'] = self._compute_overall_analysis(results)
        
        return results

    def compute_representation_shift(self, 
                                   activations1: torch.Tensor, 
                                   activations2: torch.Tensor,
                                   sae: SAE) -> RepresentationShift:
        """Compute representation shift metrics using SAE features."""
        with torch.no_grad():
            # Process both activation sets through SAE
            flat_acts1 = activations1.view(-1, activations1.size(-1))
            flat_acts2 = activations2.view(-1, activations2.size(-1))
            
            # Get SAE features
            def extract_features(flat_acts):
                sae_output = sae(flat_acts)
                if hasattr(sae_output, 'feature_acts'):
                    return sae_output.feature_acts
                elif isinstance(sae_output, tuple) and len(sae_output) >= 2:
                    return sae_output[1]
                elif hasattr(sae, 'encode'):
                    return sae.encode(flat_acts)
                else:
                    if hasattr(sae, 'W_enc') and hasattr(sae, 'b_enc'):
                        return torch.relu(flat_acts @ sae.W_enc + sae.b_enc)
                    else:
                        return torch.randn(flat_acts.shape[0], 16384, device=flat_acts.device)
            
            features1 = extract_features(flat_acts1)
            features2 = extract_features(flat_acts2)
            
            # 1. Cosine similarity
            cosine_sim = torch.nn.functional.cosine_similarity(
                features1.mean(dim=0), features2.mean(dim=0), dim=0
            ).item()
            
            # 2. L2 distance
            l2_distance = torch.norm(features1.mean(dim=0) - features2.mean(dim=0), p=2).item()
            
            # 3. Feature overlap (Jaccard similarity)
            active1 = (features1 > 0).float()
            active2 = (features2 > 0).float()
            intersection = (active1 * active2).sum(dim=0)
            union = torch.clamp(active1.sum(dim=0) + active2.sum(dim=0) - intersection, min=1)
            feature_overlap = (intersection / union).mean().item()
            
            # 4. Jensen-Shannon divergence
            def js_divergence(p, q):
                p = p + 1e-8
                q = q + 1e-8
                p = p / p.sum()
                q = q / q.sum()
                m = 0.5 * (p + q)
                return 0.5 * (torch.nn.functional.kl_div(p.log(), m, reduction='sum') + 
                             torch.nn.functional.kl_div(q.log(), m, reduction='sum'))
            
            p = features1.mean(dim=0).abs()
            q = features2.mean(dim=0).abs()
            js_div = js_divergence(p, q).item()
            
            # 5. Feature correlation
            try:
                corr_matrix = torch.corrcoef(torch.stack([
                    features1.mean(dim=0), features2.mean(dim=0)
                ]))
                feature_correlation = corr_matrix[0, 1].item() if not torch.isnan(corr_matrix[0, 1]) else 0.0
            except:
                feature_correlation = 0.0
            
            return RepresentationShift(
                cosine_similarity=cosine_sim,
                l2_distance=l2_distance,
                feature_overlap=feature_overlap,
                js_divergence=js_div,
                feature_correlation=feature_correlation
            )

    def _compute_layer_aggregate(self, layer_metrics: Dict) -> Dict:
        """Compute aggregate statistics for a single layer."""
        n_samples = len(layer_metrics['model1_metrics'])
        if n_samples == 0:
            return {}
        
        # Average metrics across samples
        avg_model1 = {}
        avg_model2 = {}
        avg_shift = {}
        
        for field in SAEMetrics.__dataclass_fields__:
            avg_model1[field] = np.mean([getattr(m, field) for m in layer_metrics['model1_metrics']])
            avg_model2[field] = np.mean([getattr(m, field) for m in layer_metrics['model2_metrics']])
        
        for field in RepresentationShift.__dataclass_fields__:
            avg_shift[field] = np.mean([getattr(s, field) for s in layer_metrics['shift_metrics']])
        
        return {
            'avg_model1_metrics': avg_model1,
            'avg_model2_metrics': avg_model2,
            'avg_shift_metrics': avg_shift,
            'n_samples': n_samples
        }

    def _compute_overall_analysis(self, results: Dict) -> Dict:
        """Compute overall analysis across all layers."""
        layers = results['layers']
        
        # Collect metrics across layers
        layer_similarities = []
        layer_overlaps = []
        layer_delta_losses = []
        layer_sparsities = []
        layer_rec_loss_topk = []
        
        for layer in layers:
            if layer in results['layer_results'] and 'aggregate' in results['layer_results'][layer]:
                agg = results['layer_results'][layer]['aggregate']
                if agg:  # Check if aggregate is not empty
                    layer_similarities.append(agg['avg_shift_metrics']['cosine_similarity'])
                    layer_overlaps.append(agg['avg_shift_metrics']['feature_overlap'])
                    layer_delta_losses.append(abs(agg['avg_model1_metrics']['model_delta_loss'] - 
                                                 agg['avg_model2_metrics']['model_delta_loss']))
                    layer_sparsities.append((agg['avg_model1_metrics']['l0_sparsity'] + 
                                           agg['avg_model2_metrics']['l0_sparsity']) / 2)

        
        # Overall insights
        overall = {
            'most_similar_layer': layers[np.argmax(layer_similarities)] if layer_similarities else None,
            'most_different_layer': layers[np.argmin(layer_similarities)] if layer_similarities else None,
            'highest_overlap_layer': layers[np.argmax(layer_overlaps)] if layer_overlaps else None,
            'highest_delta_loss_layer': layers[np.argmax(layer_delta_losses)] if layer_delta_losses else None,
            'avg_similarity_across_layers': np.mean(layer_similarities) if layer_similarities else 0,
            'avg_overlap_across_layers': np.mean(layer_overlaps) if layer_overlaps else 0,
            'avg_delta_loss_across_layers': np.mean(layer_delta_losses) if layer_delta_losses else 0,
            'layer_similarities': dict(zip(layers, layer_similarities)),
            'layer_overlaps': dict(zip(layers, layer_overlaps))
        }
        
        return overall

    def visualize_layer_sweep_results(self, results: Dict, model1_name: str, model2_name: str):
        """Create comprehensive visualization of layer sweep results."""
        layers = results['layers']
        
        # Create output filename
        model1_clean = model1_name.replace('/', '_').replace('-', '_')
        model2_clean = model2_name.replace('/', '_').replace('-', '_')
        save_path = self.output_dir / f"{model1_clean}_{model2_clean}_layer_sweep.png"
        
        fig, axes = plt.subplots(2, 3, figsize=(20, 12))
        fig.suptitle(f'SAE Layer Sweep Analysis: {model1_name} vs {model2_name}', fontsize=16)
        
        # Collect data across layers
        layer_data = {
            'similarities': [],
            'overlaps': [],
            'recon_losses_m1': [],
            'recon_losses_m2': [],
            'sparsities_m1': [],
            'sparsities_m2': [],
            'delta_losses_m1': [],
            'delta_losses_m2': []
        }
        
        for layer in layers:
            if layer in results['layer_results'] and 'aggregate' in results['layer_results'][layer]:
                agg = results['layer_results'][layer]['aggregate']
                if agg:
                    layer_data['similarities'].append(agg['avg_shift_metrics']['cosine_similarity'])
                    layer_data['overlaps'].append(agg['avg_shift_metrics']['feature_overlap'])
                    layer_data['recon_losses_m1'].append(agg['avg_model1_metrics']['reconstruction_loss'])
                    layer_data['recon_losses_m2'].append(agg['avg_model2_metrics']['reconstruction_loss'])
                    layer_data['sparsities_m1'].append(agg['avg_model1_metrics']['l0_sparsity'])
                    layer_data['sparsities_m2'].append(agg['avg_model2_metrics']['l0_sparsity'])
                    layer_data['delta_losses_m1'].append(agg['avg_model1_metrics']['model_delta_loss'])
                    layer_data['delta_losses_m2'].append(agg['avg_model2_metrics']['model_delta_loss'])
        
        # Plot 1: Representation Similarity Across Layers
        axes[0, 0].plot(layers, layer_data['similarities'], 'o-', linewidth=2, markersize=8)
        axes[0, 0].set_title('Cosine Similarity Across Layers')
        axes[0, 0].set_xlabel('Layer')
        axes[0, 0].set_ylabel('Cosine Similarity')
        axes[0, 0].grid(True, alpha=0.3)
        axes[0, 0].axhline(y=0.8, color='red', linestyle='--', alpha=0.5, label='High Similarity')
        axes[0, 0].legend()
        
        # Plot 2: Feature Overlap Across Layers
        axes[0, 1].plot(layers, layer_data['overlaps'], 'o-', color='green', linewidth=2, markersize=8)
        axes[0, 1].set_title('Feature Overlap Across Layers')
        axes[0, 1].set_xlabel('Layer')
        axes[0, 1].set_ylabel('Feature Overlap')
        axes[0, 1].grid(True, alpha=0.3)
        axes[0, 1].axhline(y=0.5, color='red', linestyle='--', alpha=0.5, label='Moderate Overlap')
        axes[0, 1].legend()
        
        # Plot 3: Reconstruction Loss Comparison
        axes[0, 2].plot(layers, layer_data['recon_losses_m1'], 'o-', label='Model 1 (LLM)', linewidth=2)
        axes[0, 2].plot(layers, layer_data['recon_losses_m2'], 's-', label='Model 2 (VLM)', linewidth=2)
        axes[0, 2].set_title('Reconstruction Loss Across Layers')
        axes[0, 2].set_xlabel('Layer')
        axes[0, 2].set_ylabel('Reconstruction Loss')
        axes[0, 2].legend()
        axes[0, 2].grid(True, alpha=0.3)
        
        # Plot 4: Sparsity Comparison
        axes[1, 0].plot(layers, layer_data['sparsities_m1'], 'o-', label='Model 1 (LLM)', linewidth=2)
        axes[1, 0].plot(layers, layer_data['sparsities_m2'], 's-', label='Model 2 (VLM)', linewidth=2)
        axes[1, 0].set_title('L0 Sparsity Across Layers')
        axes[1, 0].set_xlabel('Layer')
        axes[1, 0].set_ylabel('L0 Sparsity')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)
        
        # Plot 5: Model Delta Loss (Patching Performance)
        axes[1, 1].plot(layers, layer_data['delta_losses_m1'], 'o-', label='Model 1 (LLM)', linewidth=2)
        axes[1, 1].plot(layers, layer_data['delta_losses_m2'], 's-', label='Model 2 (VLM)', linewidth=2)
        axes[1, 1].set_title('Model Delta Loss (Patching Quality)')
        axes[1, 1].set_xlabel('Layer')
        axes[1, 1].set_ylabel('Delta Loss')
        axes[1, 1].legend()
        axes[1, 1].grid(True, alpha=0.3)
        axes[1, 1].axhline(y=0, color='black', linestyle='-', alpha=0.3)
        
        # Plot 6: Summary Heatmap
        # Create a summary matrix for visualization
        metrics_matrix = np.array([
            layer_data['similarities'],
            layer_data['overlaps'],
            np.array(layer_data['recon_losses_m1']) / max(max(layer_data['recon_losses_m1']), 1e-6),  # Normalize
            np.array(layer_data['sparsities_m1']) * 10,  # Scale up for visibility
        ])
        
        im = axes[1, 2].imshow(metrics_matrix, cmap='RdYlBu_r', aspect='auto')
        axes[1, 2].set_title('Metrics Heatmap Across Layers')
        axes[1, 2].set_xlabel('Layer Index')
        axes[1, 2].set_yticks(range(4))
        axes[1, 2].set_yticklabels(['Similarity', 'Overlap', 'Recon Loss (norm)', 'Sparsity (x10)'])
        axes[1, 2].set_xticks(range(len(layers)))
        axes[1, 2].set_xticklabels([f'L{l}' for l in layers])
        
        # Add colorbar
        cbar = plt.colorbar(im, ax=axes[1, 2])
        cbar.set_label('Metric Value')
        
        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✅ Layer sweep visualization saved to {save_path}")
        
        # Save detailed results as JSON including top features
        json_path = self.output_dir / f"{model1_clean}_{model2_clean}_results.json"
        
        # Convert results to JSON-serializable format
        json_results = {
            'layers': layers,
            'overall_analysis': results['overall_analysis'],
            'layer_summaries': {},
            'top_features_analysis': getattr(self, 'top_features_log', [])
        }
        
        for layer in layers:
            if layer in results['layer_results'] and 'aggregate' in results['layer_results'][layer]:
                agg = results['layer_results'][layer]['aggregate']
                if agg:
                    json_results['layer_summaries'][str(layer)] = agg
        
        with open(json_path, 'w') as f:
            json.dump(json_results, f, indent=2)
        print(f"✅ Detailed results saved to {json_path}")
        
        # Create top features analysis
        self._analyze_top_features_trends()

    def interpret_layer_sweep_results(self, results: Dict) -> Dict[str, str]:
        """Provide interpretation of layer sweep results."""
        overall = results['overall_analysis']
        interpretations = {}
        
        # Overall adaptation assessment
        avg_similarity = overall['avg_similarity_across_layers']
        if avg_similarity > 0.85:
            interpretations['adaptation_magnitude'] = "✅ MINIMAL LLM→VLM adaptation - representations largely preserved"
        elif avg_similarity > 0.7:
            interpretations['adaptation_magnitude'] = "⚠️ MODERATE LLM→VLM adaptation - selective representational changes"
        else:
            interpretations['adaptation_magnitude'] = "🔍 SIGNIFICANT LLM→VLM adaptation - substantial representational reorganization"
        
        # Layer-specific insights
        if overall['most_different_layer'] is not None:
            interpretations['adaptation_location'] = f"🎯 Layer {overall['most_different_layer']} shows maximum adaptation"
        
        if overall['highest_overlap_layer'] is not None:
            interpretations['feature_preservation'] = f"🔗 Layer {overall['highest_overlap_layer']} best preserves LLM features"
        
        # Adaptation pattern
        layer_sims = list(overall['layer_similarities'].values())
        if len(layer_sims) >= 3:
            early_sim = np.mean(layer_sims[:len(layer_sims)//3])
            late_sim = np.mean(layer_sims[-len(layer_sims)//3:])
            
            if early_sim > late_sim + 0.1:
                interpretations['adaptation_pattern'] = "📈 Early layers preserve LLM representations better than late layers"
            elif late_sim > early_sim + 0.1:
                interpretations['adaptation_pattern'] = "📉 Late layers preserve LLM representations better than early layers"
            else:
                interpretations['adaptation_pattern'] = "📊 Uniform adaptation pattern across layers"
        
        # SAE quality assessment
        avg_delta_loss = overall['avg_delta_loss_across_layers']
        if avg_delta_loss < 0.1:
            interpretations['sae_quality'] = "✅ SAE reconstructions preserve model functionality well"
        elif avg_delta_loss < 0.5:
            interpretations['sae_quality'] = "⚠️ SAE reconstructions cause moderate functional degradation"
        else:
            interpretations['sae_quality'] = "❌ SAE reconstructions significantly impact model functionality"
        
        return interpretations


def main():
    """Main function for comprehensive LLM->VLM representation shift analysis."""
    print("🚀 Comprehensive SAE Layer Sweep Analysis: LLM→VLM Adaptation")
    print("=" * 70)
    
    # Configuration
    MODEL_SIZE = "2b"
    WIDTH = "16k"
    SUFFIX = "canonical"
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    LAYERS = [4, 8, 12, 16, 20, 24]  # Sample across the model depth
    
    try:
        # Initialize analyzer
        analyzer = MemoryEfficientSAEAnalyzer(
            model_size=MODEL_SIZE,
            width=WIDTH,
            suffix=SUFFIX,
            device=DEVICE
        )
        
        # Load dataset
        print("\n📚 Loading Datasets...")
        dataset_loader = DatasetLoader(device=DEVICE)
        texts = dataset_loader.get_mixed_dataset(total_samples=1000)  # Use 1K data as requested
        
        print(f"✅ Loaded {len(texts)} texts from mixed datasets")
        print(f"Sample texts: {texts[:3]}")
        
        # Model configuration for LLM->VLM comparison
        model1_name = "google/gemma-2-2b"  # Base Gemma-2-2B (LLM)
        model2_name = "google/paligemma2-3b-pt-224"  # PaliGemma with Gemma-2-2B decoder (VLM)
        
        print(f"\n🔬 Research Configuration:")
        print(f"   Model 1 (LLM): {model1_name}")
        print(f"   Model 2 (VLM): {model2_name}")
        print(f"   Layers to analyze: {LAYERS}")
        print(f"   SAE Configuration: {MODEL_SIZE}-{WIDTH}-{SUFFIX}")
        print(f"   Device: {DEVICE}")
        print(f"   Total texts: {len(texts)}")
        
        # Run layer sweep analysis
        print(f"\n🚀 Starting Layer Sweep Analysis...")
        results = analyzer.analyze_layer_sweep(
            model1_name=model1_name,
            model2_name=model2_name,
            texts=texts,
            layers=LAYERS
        )
        
        # Generate interpretations
        interpretations = analyzer.interpret_layer_sweep_results(results)
        
        print(f"\n📊 LAYER SWEEP RESULTS:")
        print("=" * 50)
        
        overall = results['overall_analysis']
        print(f"Most Similar Layer: {overall['most_similar_layer']}")
        print(f"Most Different Layer: {overall['most_different_layer']}")
        print(f"Highest Feature Overlap Layer: {overall['highest_overlap_layer']}")
        print(f"Average Similarity Across Layers: {overall['avg_similarity_across_layers']:.3f}")
        print(f"Average Feature Overlap: {overall['avg_overlap_across_layers']:.3f}")
        
        print(f"\n🔍 INTERPRETATIONS:")
        print("=" * 50)
        for aspect, interpretation in interpretations.items():
            print(f"{aspect.replace('_', ' ').title()}: {interpretation}")
        
        # Create visualizations
        print(f"\n📈 Generating Visualizations...")
        analyzer.visualize_layer_sweep_results(results, model1_name, model2_name)
        
        print(f"\n✅ Analysis Complete!")
        print(f"📁 Results saved to: {analyzer.output_dir}")
        print(f"🧠 Key Finding: {interpretations.get('adaptation_magnitude', 'Analysis completed')}")
        
        # Memory cleanup
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            print(f"🔧 Final GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f}GB")
        
    except Exception as e:
        print(f"❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        
        print("\n💡 Troubleshooting Tips:")
        print("   1. Ensure sufficient GPU memory (8GB+ recommended)")
        print("   2. Reduce LAYERS list or sample size if out of memory")
        print("   3. Check model names are correct and accessible")
        print("   4. Install required packages: pip install sae-lens transformers datasets")


if __name__ == "__main__":
    main()

# Installation requirements:
"""
pip install sae-lens transformers torch matplotlib seaborn numpy datasets tqdm

# For CUDA support (recommended):
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
"""

  from .autonotebook import tqdm as notebook_tqdm


🚀 Comprehensive SAE Layer Sweep Analysis: LLM→VLM Adaptation
🔧 Initialized SAE Analyzer
   Device: cuda
   Model Size: 2b
   SAE Width: 16k
   Output Dir: ../figs_tabs

📚 Loading Datasets...
✅ Loaded 666 CIFAR-100 captions
❌ Error loading COCO: 'utf-8' codec can't decode byte 0xc4 in position 4: invalid continuation byte
❌ Error loading LLaVA-Bench: Config name is missing.
Please pick one among the available configs: ['CLEVR-Math(MathV360K)', 'Evol-Instruct-GPT4-Turbo', 'FigureQA(MathV360K)', 'GEOS(MathV360K)', 'GeoQA+(MathV360K)', 'Geometry3K(MathV360K)', 'IconQA(MathV360K)', 'MapQA(MathV360K)', 'MathV360K_TQA', 'MathV360K_VQA-AS', 'MathV360K_VQA-RAD', 'PMC-VQA(MathV360K)', 'Super-CLEVR(MathV360K)', 'TabMWP(MathV360K)', 'UniGeo(MathV360K)', 'VisualWebInstruct(filtered)', 'VizWiz(MathV360K)', 'ai2d(cauldron,llava_format)', 'ai2d(gpt4v)', 'ai2d(internvl)', 'allava_instruct_laion4v', 'allava_instruct_vflan4v', 'aokvqa(cauldron,llava_format)', 'chart2text(cauldron)', 'chartqa(cauldron,lla

Processing layers:   0%|          | 0/6 [00:00<?, ?it/s]


📊 Processing Layer 4
   📥 Loading SAE Layer 4: layer_4/width_16k/canonical



Layer 4 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.65it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.14s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.51it/s][A[A

Layer 4 texts:   1%|          | 1/100 [00:13<22:21, 13.55s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 52.09it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.07s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.57it/s][A[A

Layer 4 texts:   2%|▏         | 2/100 [00:25<20:54, 12.80s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.27it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.37s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.27it/s][A[A

Layer 4 texts:   3%|▎         | 3/100 [00:38<20:44, 12.83s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.62it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.35s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.29it/s][A[A

Layer 4 texts:   4%|▍         | 4/100 [00:51<20:24, 12.76s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.39it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.01it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.45it/s][A[A

Layer 4 texts:   5%|▌         | 5/100 [01:03<19:56, 12.59s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.43it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.05s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.63it/s][A[A

Layer 4 texts:   6%|▌         | 6/100 [01:16<19:43, 12.59s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.50it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.03it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.74it/s][A[A

Layer 4 texts:   7%|▋         | 7/100 [01:28<19:23, 12.52s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.81it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.08it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.80it/s][A[A

Layer 4 texts:   8%|▊         | 8/100 [01:41<19:14, 12.54s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.65it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.07it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.80it/s][A[A

Layer 4 texts:   9%|▉         | 9/100 [01:53<18:47, 12.39s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.59it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 4 texts:  10%|█         | 10/100 [02:05<18:33, 12.37s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.69it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 4 texts:  11%|█         | 11/100 [02:17<18:15, 12.31s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.81it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.07it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.80it/s][A[A

Layer 4 texts:  12%|█▏        | 12/100 [02:29<17:48, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.13it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.08it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.61it/s][A[A

Layer 4 texts:  13%|█▎        | 13/100 [02:41<17:30, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.34it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 4 texts:  14%|█▍        | 14/100 [02:53<17:13, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.29it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 4 texts:  15%|█▌        | 15/100 [03:05<17:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.85it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 4 texts:  16%|█▌        | 16/100 [03:17<16:41, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.64it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 4 texts:  17%|█▋        | 17/100 [03:29<16:31, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.04it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 4 texts:  18%|█▊        | 18/100 [03:41<16:30, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.21it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 4 texts:  19%|█▉        | 19/100 [03:53<16:24, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.66it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 4 texts:  20%|██        | 20/100 [04:05<16:09, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.51it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.10s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.56it/s][A[A

Layer 4 texts:  21%|██        | 21/100 [04:18<16:05, 12.22s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.12it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.00it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.70it/s][A[A

Layer 4 texts:  22%|██▏       | 22/100 [04:30<15:47, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.25it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 4 texts:  23%|██▎       | 23/100 [04:42<15:29, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.23it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 4 texts:  24%|██▍       | 24/100 [04:54<15:21, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.46it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 4 texts:  25%|██▌       | 25/100 [05:06<15:08, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.57it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 4 texts:  26%|██▌       | 26/100 [05:18<14:52, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.81it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 4 texts:  27%|██▋       | 27/100 [05:30<14:34, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.49it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 4 texts:  28%|██▊       | 28/100 [05:42<14:27, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.29it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.58it/s][A[A

Layer 4 texts:  29%|██▉       | 29/100 [05:54<14:17, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.66it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 4 texts:  30%|███       | 30/100 [06:06<14:09, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.61it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 4 texts:  31%|███       | 31/100 [06:18<13:57, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.09it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  32%|███▏      | 32/100 [06:31<13:48, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.89it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 4 texts:  33%|███▎      | 33/100 [06:43<13:39, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.96it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 4 texts:  34%|███▍      | 34/100 [06:55<13:24, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.59it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  35%|███▌      | 35/100 [07:07<13:07, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.12it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 4 texts:  36%|███▌      | 36/100 [07:19<12:49, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.96it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.01s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.66it/s][A[A

Layer 4 texts:  37%|███▋      | 37/100 [07:31<12:39, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.66it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.79it/s][A[A

Layer 4 texts:  38%|███▊      | 38/100 [07:43<12:24, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.73it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 4 texts:  39%|███▉      | 39/100 [07:55<12:17, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.73it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.86it/s][A[A

Layer 4 texts:  40%|████      | 40/100 [08:08<12:09, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 49.58it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.88it/s][A[A

Layer 4 texts:  41%|████      | 41/100 [08:20<12:02, 12.25s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.10it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 4 texts:  42%|████▏     | 42/100 [08:32<11:47, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.38it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 4 texts:  43%|████▎     | 43/100 [08:44<11:32, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.35it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 4 texts:  44%|████▍     | 44/100 [08:56<11:14, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.96it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.67it/s][A[A

Layer 4 texts:  45%|████▌     | 45/100 [09:08<11:02, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 52.23it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 4 texts:  46%|████▌     | 46/100 [09:20<10:52, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.96it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 4 texts:  47%|████▋     | 47/100 [09:33<10:44, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.51it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 4 texts:  48%|████▊     | 48/100 [09:45<10:31, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.61it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 4 texts:  49%|████▉     | 49/100 [09:57<10:21, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.33it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 4 texts:  50%|█████     | 50/100 [10:09<10:09, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.05it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 4 texts:  51%|█████     | 51/100 [10:21<09:56, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.90it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 4 texts:  52%|█████▏    | 52/100 [10:33<09:41, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.53it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 4 texts:  53%|█████▎    | 53/100 [10:45<09:24, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.85it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 4 texts:  54%|█████▍    | 54/100 [10:57<09:07, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 53.44it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  55%|█████▌    | 55/100 [11:09<09:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 54.48it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  56%|█████▌    | 56/100 [11:21<08:54, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.49it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 4 texts:  57%|█████▋    | 57/100 [11:33<08:39, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.52it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 4 texts:  58%|█████▊    | 58/100 [11:46<08:30, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.88it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  59%|█████▉    | 59/100 [11:58<08:17, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.99it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  60%|██████    | 60/100 [12:09<08:01, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.82it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.02s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.67it/s][A[A

Layer 4 texts:  61%|██████    | 61/100 [12:21<07:48, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.15it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 4 texts:  62%|██████▏   | 62/100 [12:33<07:36, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.21it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 4 texts:  63%|██████▎   | 63/100 [12:46<07:25, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 49.15it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 4 texts:  64%|██████▍   | 64/100 [12:58<07:13, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.42it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 4 texts:  65%|██████▌   | 65/100 [13:10<07:02, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.72it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 4 texts:  66%|██████▌   | 66/100 [13:21<06:46, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.62it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  67%|██████▋   | 67/100 [13:33<06:31, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.58it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 4 texts:  68%|██████▊   | 68/100 [13:45<06:21, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.32it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 4 texts:  69%|██████▉   | 69/100 [13:57<06:07, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.50it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 4 texts:  70%|███████   | 70/100 [14:09<05:55, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.10it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 4 texts:  71%|███████   | 71/100 [14:21<05:45, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.79it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 4 texts:  72%|███████▏  | 72/100 [14:33<05:34, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.26it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  73%|███████▎  | 73/100 [14:45<05:25, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.39it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 4 texts:  74%|███████▍  | 74/100 [14:57<05:12, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.57it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.88it/s][A[A

Layer 4 texts:  75%|███████▌  | 75/100 [15:14<05:39, 13.59s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.75it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  76%|███████▌  | 76/100 [15:26<05:15, 13.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.78it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 4 texts:  77%|███████▋  | 77/100 [15:38<04:54, 12.79s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.86it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 4 texts:  78%|███████▊  | 78/100 [15:51<04:37, 12.60s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.75it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 4 texts:  79%|███████▉  | 79/100 [16:03<04:24, 12.58s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.36it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  80%|████████  | 80/100 [16:15<04:07, 12.36s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.11it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.35it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.20it/s][A[A

Layer 4 texts:  81%|████████  | 81/100 [16:27<03:51, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.17it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 4 texts:  82%|████████▏ | 82/100 [16:39<03:37, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.99it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 4 texts:  83%|████████▎ | 83/100 [16:50<03:24, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.77it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 4 texts:  84%|████████▍ | 84/100 [17:03<03:13, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.93it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 4 texts:  85%|████████▌ | 85/100 [17:15<03:01, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.14it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 4 texts:  86%|████████▌ | 86/100 [17:27<02:50, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.95it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 4 texts:  87%|████████▋ | 87/100 [17:39<02:37, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.38it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  88%|████████▊ | 88/100 [17:51<02:24, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.70it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.10it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.84it/s][A[A

Layer 4 texts:  89%|████████▉ | 89/100 [18:03<02:13, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.52it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 4 texts:  90%|█████████ | 90/100 [18:15<02:00, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.19it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 4 texts:  91%|█████████ | 91/100 [18:28<01:49, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.73it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 4 texts:  92%|█████████▏| 92/100 [18:40<01:37, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.58it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 4 texts:  93%|█████████▎| 93/100 [18:52<01:25, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.83it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  94%|█████████▍| 94/100 [19:04<01:13, 12.22s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.68it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  95%|█████████▌| 95/100 [19:16<01:00, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.48it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 4 texts:  96%|█████████▌| 96/100 [19:29<00:48, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.61it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 4 texts:  97%|█████████▋| 97/100 [19:41<00:36, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.26it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  98%|█████████▊| 98/100 [19:53<00:24, 12.27s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.88it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 4 texts:  99%|█████████▉| 99/100 [20:05<00:12, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 54.02it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts: 100%|██████████| 100/100 [20:17<00:00, 12.06s/it][A
Processing layers:  17%|█▋        | 1/6 [20:21<1:41:47, 1221.48s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
   Memory after layer 4: 12.53GB

📊 Processing Layer 8
   📥 Loading SAE Layer 8: layer_8/width_16k/canonical



Layer 8 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.65it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.11it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s][A[A

Layer 8 texts:   1%|          | 1/100 [00:12<19:55, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.30it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.67it/s][A[A

Layer 8 texts:   2%|▏         | 2/100 [00:24<19:52, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.08it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 8 texts:   3%|▎         | 3/100 [00:36<19:29, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.48it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 8 texts:   4%|▍         | 4/100 [00:48<19:32, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.08it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 8 texts:   5%|▌         | 5/100 [01:00<19:21, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.13it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 8 texts:   6%|▌         | 6/100 [01:12<18:59, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.77it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.00s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.69it/s][A[A

Layer 8 texts:   7%|▋         | 7/100 [01:25<18:50, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 51.49it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 8 texts:   8%|▊         | 8/100 [01:37<18:32, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 52.79it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 8 texts:   9%|▉         | 9/100 [01:49<18:17, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.60it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 8 texts:  10%|█         | 10/100 [02:00<18:02, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.41it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 8 texts:  11%|█         | 11/100 [02:13<17:55, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.72it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 8 texts:  12%|█▏        | 12/100 [02:25<17:38, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.51it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 8 texts:  13%|█▎        | 13/100 [02:37<17:30, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.11it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 8 texts:  14%|█▍        | 14/100 [02:49<17:14, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.63it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 8 texts:  15%|█▌        | 15/100 [03:01<17:09, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.33it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 8 texts:  16%|█▌        | 16/100 [03:13<17:00, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.06it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 8 texts:  17%|█▋        | 17/100 [03:25<16:46, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.44it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 8 texts:  18%|█▊        | 18/100 [03:37<16:30, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 54.40it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.02s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.62it/s][A[A

Layer 8 texts:  19%|█▉        | 19/100 [03:49<16:21, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.78it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  20%|██        | 20/100 [04:01<16:06, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.80it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.36it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.19it/s][A[A

Layer 8 texts:  21%|██        | 21/100 [04:13<15:46, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.29it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 8 texts:  22%|██▏       | 22/100 [04:25<15:33, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.73it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 8 texts:  23%|██▎       | 23/100 [04:37<15:29, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.52it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.32it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 8 texts:  24%|██▍       | 24/100 [04:50<15:18, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.81it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A


⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss



Layer 8 texts:  25%|██▌       | 25/100 [05:02<15:07, 12.10s/it][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 52.04it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 8 texts:  26%|██▌       | 26/100 [05:13<14:47, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.75it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 8 texts:  27%|██▋       | 27/100 [05:26<14:37, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 51.81it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  28%|██▊       | 28/100 [05:38<14:26, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.37it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 8 texts:  29%|██▉       | 29/100 [05:50<14:16, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.83it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 8 texts:  30%|███       | 30/100 [06:02<14:05, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.54it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 8 texts:  31%|███       | 31/100 [06:14<13:52, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.27it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.86it/s][A[A

Layer 8 texts:  32%|███▏      | 32/100 [06:26<13:42, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s][A[A

Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 12.32it/s][A[A


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.04s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.64it/s][A[A

Layer 8 texts:  33%|███▎      | 33/100 [06:38<13:34, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.56it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 8 texts:  34%|███▍      | 34/100 [06:50<13:18, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.49it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 8 texts:  35%|███▌      | 35/100 [07:02<13:05, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.56it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 8 texts:  36%|███▌      | 36/100 [07:15<13:02, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.87it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 8 texts:  37%|███▋      | 37/100 [07:27<12:48, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.89it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 8 texts:  38%|███▊      | 38/100 [07:39<12:30, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.65it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 8 texts:  39%|███▉      | 39/100 [07:51<12:12, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.21it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.73it/s][A[A

Layer 8 texts:  40%|████      | 40/100 [08:03<12:00, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.21it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 8 texts:  41%|████      | 41/100 [08:14<11:43, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.45it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 8 texts:  42%|████▏     | 42/100 [08:27<11:37, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.56it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 8 texts:  43%|████▎     | 43/100 [08:39<11:27, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.82it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 8 texts:  44%|████▍     | 44/100 [08:51<11:16, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.38it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 8 texts:  45%|████▌     | 45/100 [09:03<11:01, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.22it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 8 texts:  46%|████▌     | 46/100 [09:15<10:45, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.43it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 8 texts:  47%|████▋     | 47/100 [09:26<10:30, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.22it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.35it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.17it/s][A[A

Layer 8 texts:  48%|████▊     | 48/100 [09:38<10:17, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.44it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 8 texts:  49%|████▉     | 49/100 [09:51<10:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.48it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 8 texts:  50%|█████     | 50/100 [10:03<10:03, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.80it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 8 texts:  51%|█████     | 51/100 [10:15<09:56, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.52it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  52%|█████▏    | 52/100 [10:28<09:46, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.99it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  53%|█████▎    | 53/100 [10:40<09:35, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.74it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 8 texts:  54%|█████▍    | 54/100 [10:52<09:18, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 45.89it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.70it/s][A[A

Layer 8 texts:  55%|█████▌    | 55/100 [11:04<09:05, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 54.72it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 8 texts:  56%|█████▌    | 56/100 [11:16<08:50, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.66it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  57%|█████▋    | 57/100 [11:28<08:41, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.03it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  58%|█████▊    | 58/100 [11:40<08:26, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.02it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 8 texts:  59%|█████▉    | 59/100 [11:52<08:14, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.02it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 8 texts:  60%|██████    | 60/100 [12:04<08:00, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.29it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 8 texts:  61%|██████    | 61/100 [12:16<07:44, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.90it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 8 texts:  62%|██████▏   | 62/100 [12:27<07:30, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.07it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 8 texts:  63%|██████▎   | 63/100 [12:39<07:21, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.04it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 8 texts:  64%|██████▍   | 64/100 [12:52<07:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.84it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 8 texts:  65%|██████▌   | 65/100 [13:04<07:02, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.70it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 8 texts:  66%|██████▌   | 66/100 [13:16<06:52, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.12it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 8 texts:  67%|██████▋   | 67/100 [13:28<06:38, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.83it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 8 texts:  68%|██████▊   | 68/100 [13:40<06:22, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.42it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.35it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.74it/s][A[A

Layer 8 texts:  69%|██████▉   | 69/100 [13:52<06:12, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.58it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 8 texts:  70%|███████   | 70/100 [14:04<06:04, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.78it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 8 texts:  71%|███████   | 71/100 [14:16<05:51, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.03it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 8 texts:  72%|███████▏  | 72/100 [14:28<05:39, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.74it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 8 texts:  73%|███████▎  | 73/100 [14:41<05:28, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.93it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 8 texts:  74%|███████▍  | 74/100 [14:53<05:16, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.09it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 8 texts:  75%|███████▌  | 75/100 [15:05<05:05, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.49it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 8 texts:  76%|███████▌  | 76/100 [15:18<04:54, 12.26s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.94it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 8 texts:  77%|███████▋  | 77/100 [15:30<04:41, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.71it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A


⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss



Layer 8 texts:  78%|███████▊  | 78/100 [15:42<04:27, 12.16s/it][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 54.37it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 8 texts:  79%|███████▉  | 79/100 [15:54<04:13, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.95it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.65it/s][A[A

Layer 8 texts:  80%|████████  | 80/100 [16:06<04:01, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.38it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.14it/s][A[A

Layer 8 texts:  81%|████████  | 81/100 [16:18<03:48, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.00it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 8 texts:  82%|████████▏ | 82/100 [16:30<03:36, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.35it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 8 texts:  83%|████████▎ | 83/100 [16:42<03:26, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.00it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 8 texts:  84%|████████▍ | 84/100 [16:54<03:14, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 54.16it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 8 texts:  85%|████████▌ | 85/100 [17:06<03:01, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 31.33it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.09it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.83it/s][A[A

Layer 8 texts:  86%|████████▌ | 86/100 [17:18<02:49, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.57it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.05it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.78it/s][A[A

Layer 8 texts:  87%|████████▋ | 87/100 [17:30<02:36, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.16it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 8 texts:  88%|████████▊ | 88/100 [17:42<02:24, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.00it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  89%|████████▉ | 89/100 [17:55<02:12, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.69it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 8 texts:  90%|█████████ | 90/100 [18:07<02:01, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.09it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 8 texts:  91%|█████████ | 91/100 [18:19<01:49, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.06it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 8 texts:  92%|█████████▏| 92/100 [18:31<01:36, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.67it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 8 texts:  93%|█████████▎| 93/100 [18:43<01:23, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.59it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 8 texts:  94%|█████████▍| 94/100 [18:55<01:11, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.92it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  95%|█████████▌| 95/100 [19:06<00:59, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.32it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  96%|█████████▌| 96/100 [19:19<00:48, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.68it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  97%|█████████▋| 97/100 [19:31<00:36, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.51it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 8 texts:  98%|█████████▊| 98/100 [19:43<00:24, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.02it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 8 texts:  99%|█████████▉| 99/100 [19:55<00:12, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.44it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 8 texts: 100%|██████████| 100/100 [20:07<00:00, 12.06s/it][A
Processing layers:  33%|███▎      | 2/6 [40:29<1:20:54, 1213.72s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
   Memory after layer 8: 12.83GB

📊 Processing Layer 12
   📥 Loading SAE Layer 12: layer_12/width_16k/canonical



Layer 12 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.58it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.07it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.81it/s][A[A

Layer 12 texts:   1%|          | 1/100 [00:11<19:36, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.96it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.09it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.84it/s][A[A

Layer 12 texts:   2%|▏         | 2/100 [00:23<19:32, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.36it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 12 texts:   3%|▎         | 3/100 [00:35<19:18, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.05it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 12 texts:   4%|▍         | 4/100 [00:48<19:16, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.72it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 12 texts:   5%|▌         | 5/100 [01:00<19:08, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.66it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 12 texts:   6%|▌         | 6/100 [01:12<18:49, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.15it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 12 texts:   7%|▋         | 7/100 [01:23<18:31, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.00it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 12 texts:   8%|▊         | 8/100 [01:35<18:19, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.35it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.78it/s][A[A

Layer 12 texts:   9%|▉         | 9/100 [01:47<18:12, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.26it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.33it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.17it/s][A[A

Layer 12 texts:  10%|█         | 10/100 [01:59<17:54, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.11it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 12 texts:  11%|█         | 11/100 [02:11<17:50, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.70it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 12 texts:  12%|█▏        | 12/100 [02:24<17:41, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.87it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 12 texts:  13%|█▎        | 13/100 [02:35<17:21, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.55it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 12 texts:  14%|█▍        | 14/100 [02:47<17:13, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.82it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 12 texts:  15%|█▌        | 15/100 [03:00<17:01, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.18it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 12 texts:  16%|█▌        | 16/100 [03:12<16:55, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.13it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 12 texts:  17%|█▋        | 17/100 [03:24<16:57, 12.26s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.76it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 12 texts:  18%|█▊        | 18/100 [03:37<16:49, 12.32s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.12it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 12 texts:  19%|█▉        | 19/100 [03:49<16:22, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.78it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 12 texts:  20%|██        | 20/100 [04:00<16:04, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.43it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.07it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.80it/s][A[A

Layer 12 texts:  21%|██        | 21/100 [04:12<15:50, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.96it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 12 texts:  22%|██▏       | 22/100 [04:24<15:38, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.37it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 12 texts:  23%|██▎       | 23/100 [04:37<15:27, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.08it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 12 texts:  24%|██▍       | 24/100 [04:48<15:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.57it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.32it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.65it/s][A[A

Layer 12 texts:  25%|██▌       | 25/100 [05:00<15:00, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.72it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.76it/s][A[A

Layer 12 texts:  26%|██▌       | 26/100 [05:12<14:48, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.94it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 12 texts:  27%|██▋       | 27/100 [05:24<14:30, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.39it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 12 texts:  28%|██▊       | 28/100 [05:37<14:27, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.27it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 12 texts:  29%|██▉       | 29/100 [05:49<14:17, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.75it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 12 texts:  30%|███       | 30/100 [06:01<14:03, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.14it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.11s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.56it/s][A[A

Layer 12 texts:  31%|███       | 31/100 [06:13<13:54, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.92it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.01it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.72it/s][A[A

Layer 12 texts:  32%|███▏      | 32/100 [06:25<13:43, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.41it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.02s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.67it/s][A[A

Layer 12 texts:  33%|███▎      | 33/100 [06:37<13:32, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.79it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 12 texts:  34%|███▍      | 34/100 [06:49<13:14, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.61it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 12 texts:  35%|███▌      | 35/100 [07:01<13:06, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.41it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 12 texts:  36%|███▌      | 36/100 [07:13<12:56, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.48it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 12 texts:  37%|███▋      | 37/100 [07:26<12:44, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.34it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 12 texts:  38%|███▊      | 38/100 [07:37<12:27, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.46it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 12 texts:  39%|███▉      | 39/100 [07:49<12:13, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.78it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.78it/s][A[A

Layer 12 texts:  40%|████      | 40/100 [08:02<12:03, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.45it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.10it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s][A[A

Layer 12 texts:  41%|████      | 41/100 [08:14<11:49, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.73it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 12 texts:  42%|████▏     | 42/100 [08:26<11:40, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.60it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 12 texts:  43%|████▎     | 43/100 [08:38<11:36, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.76it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 12 texts:  44%|████▍     | 44/100 [08:50<11:24, 12.22s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.21it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 12 texts:  45%|████▌     | 45/100 [09:03<11:10, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.09it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 12 texts:  46%|████▌     | 46/100 [09:14<10:52, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.34it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 12 texts:  47%|████▋     | 47/100 [09:26<10:40, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.22it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.06it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.78it/s][A[A

Layer 12 texts:  48%|████▊     | 48/100 [09:39<10:27, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.04it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 12 texts:  49%|████▉     | 49/100 [09:51<10:18, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.93it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 12 texts:  50%|█████     | 50/100 [10:03<10:05, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.30it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 12 texts:  51%|█████     | 51/100 [10:15<09:53, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.09it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 12 texts:  52%|█████▏    | 52/100 [10:27<09:44, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.72it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 12 texts:  53%|█████▎    | 53/100 [10:39<09:29, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.89it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.62it/s][A[A

Layer 12 texts:  54%|█████▍    | 54/100 [10:51<09:17, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.45it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 12 texts:  55%|█████▌    | 55/100 [11:03<09:01, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.47it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 12 texts:  56%|█████▌    | 56/100 [11:16<08:55, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.16it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 12 texts:  57%|█████▋    | 57/100 [11:28<08:40, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.82it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 12 texts:  58%|█████▊    | 58/100 [11:40<08:31, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.38it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 12 texts:  59%|█████▉    | 59/100 [11:52<08:21, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.38it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 12 texts:  60%|██████    | 60/100 [12:05<08:09, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.26it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 12 texts:  61%|██████    | 61/100 [12:17<07:53, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.35it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 12 texts:  62%|██████▏   | 62/100 [12:28<07:37, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.61it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.36it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.20it/s][A[A

Layer 12 texts:  63%|██████▎   | 63/100 [12:40<07:21, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.76it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 12 texts:  64%|██████▍   | 64/100 [12:52<07:10, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.81it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 12 texts:  65%|██████▌   | 65/100 [13:04<07:01, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.58it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 12 texts:  66%|██████▌   | 66/100 [13:17<06:52, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.12it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 12 texts:  67%|██████▋   | 67/100 [13:29<06:39, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.06it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 12 texts:  68%|██████▊   | 68/100 [13:41<06:25, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.47it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 12 texts:  69%|██████▉   | 69/100 [13:52<06:09, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.63it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.04s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.64it/s][A[A

Layer 12 texts:  70%|███████   | 70/100 [14:05<06:03, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.98it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 12 texts:  71%|███████   | 71/100 [14:17<05:51, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.33it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 12 texts:  72%|███████▏  | 72/100 [14:29<05:38, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.78it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 12 texts:  73%|███████▎  | 73/100 [14:41<05:26, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.64it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 12 texts:  74%|███████▍  | 74/100 [14:54<05:18, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.03it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 12 texts:  75%|███████▌  | 75/100 [15:06<05:07, 12.30s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.74it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 12 texts:  76%|███████▌  | 76/100 [15:18<04:55, 12.32s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.90it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 12 texts:  77%|███████▋  | 77/100 [15:30<04:41, 12.22s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.98it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.05it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.77it/s][A[A

Layer 12 texts:  78%|███████▊  | 78/100 [15:42<04:27, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.36it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 12 texts:  79%|███████▉  | 79/100 [15:55<04:16, 12.22s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.19it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 12 texts:  80%|████████  | 80/100 [16:07<04:04, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.71it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 12 texts:  81%|████████  | 81/100 [16:19<03:51, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.36it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 12 texts:  82%|████████▏ | 82/100 [16:32<03:40, 12.26s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.61it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 12 texts:  83%|████████▎ | 83/100 [16:44<03:27, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.20it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 12 texts:  84%|████████▍ | 84/100 [16:56<03:14, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.85it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 12 texts:  85%|████████▌ | 85/100 [17:07<03:00, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.56it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 12 texts:  86%|████████▌ | 86/100 [17:20<02:49, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.51it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 12 texts:  87%|████████▋ | 87/100 [17:32<02:37, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.91it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 12 texts:  88%|████████▊ | 88/100 [17:44<02:25, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.09it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 12 texts:  89%|████████▉ | 89/100 [17:56<02:13, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.82it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 12 texts:  90%|█████████ | 90/100 [18:08<02:00, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.09it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.03it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.75it/s][A[A

Layer 12 texts:  91%|█████████ | 91/100 [18:20<01:48, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.34it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.33it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.66it/s][A[A

Layer 12 texts:  92%|█████████▏| 92/100 [18:33<01:37, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.81it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 12 texts:  93%|█████████▎| 93/100 [18:44<01:24, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.13it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 12 texts:  94%|█████████▍| 94/100 [18:56<01:12, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.96it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 12 texts:  95%|█████████▌| 95/100 [19:09<01:00, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.50it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 12 texts:  96%|█████████▌| 96/100 [19:20<00:48, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.93it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 12 texts:  97%|█████████▋| 97/100 [19:32<00:35, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.87it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 12 texts:  98%|█████████▊| 98/100 [19:44<00:23, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.24it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 12 texts:  99%|█████████▉| 99/100 [19:56<00:11, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.90it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 12 texts: 100%|██████████| 100/100 [20:08<00:00, 11.94s/it][A
Processing layers:  50%|█████     | 3/6 [1:00:39<1:00:35, 1211.75s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
   Memory after layer 12: 12.83GB

📊 Processing Layer 16
   📥 Loading SAE Layer 16: layer_16/width_16k/canonical



Layer 16 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.76it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 16 texts:   1%|          | 1/100 [00:11<19:26, 11.78s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.73it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 16 texts:   2%|▏         | 2/100 [00:23<19:23, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.57it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 16 texts:   3%|▎         | 3/100 [00:35<19:20, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.27it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 16 texts:   4%|▍         | 4/100 [00:47<19:12, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.72it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 16 texts:   5%|▌         | 5/100 [00:59<18:56, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.62it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 16 texts:   6%|▌         | 6/100 [01:11<18:40, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.15it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 16 texts:   7%|▋         | 7/100 [01:23<18:28, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.68it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 16 texts:   8%|▊         | 8/100 [01:35<18:13, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.51it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 16 texts:   9%|▉         | 9/100 [01:47<17:58, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.38it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 16 texts:  10%|█         | 10/100 [01:59<17:56, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.27it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.32it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.14it/s][A[A

Layer 16 texts:  11%|█         | 11/100 [02:11<17:40, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.27it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 16 texts:  12%|█▏        | 12/100 [02:22<17:24, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.74it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 16 texts:  13%|█▎        | 13/100 [02:34<17:15, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.21it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 16 texts:  14%|█▍        | 14/100 [02:46<17:08, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.82it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 16 texts:  15%|█▌        | 15/100 [02:58<16:55, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.75it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 16 texts:  16%|█▌        | 16/100 [03:10<16:42, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.16it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 16 texts:  17%|█▋        | 17/100 [03:22<16:26, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.62it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 16 texts:  18%|█▊        | 18/100 [03:34<16:11, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.36it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 16 texts:  19%|█▉        | 19/100 [03:46<16:00, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.66it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 16 texts:  20%|██        | 20/100 [03:58<15:48, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.81it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 16 texts:  21%|██        | 21/100 [04:09<15:36, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.58it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 16 texts:  22%|██▏       | 22/100 [04:21<15:26, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.18it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 16 texts:  23%|██▎       | 23/100 [04:33<15:12, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.01it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 16 texts:  24%|██▍       | 24/100 [04:45<15:00, 11.84s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.88it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 16 texts:  25%|██▌       | 25/100 [04:57<14:46, 11.82s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.35it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 16 texts:  26%|██▌       | 26/100 [05:09<14:39, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.08it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 16 texts:  27%|██▋       | 27/100 [05:21<14:29, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.94it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 16 texts:  28%|██▊       | 28/100 [05:33<14:19, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.26it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 16 texts:  29%|██▉       | 29/100 [05:45<14:04, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.18it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 16 texts:  30%|███       | 30/100 [05:57<13:56, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.09it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 16 texts:  31%|███       | 31/100 [06:08<13:40, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.42it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 16 texts:  32%|███▏      | 32/100 [06:20<13:27, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.93it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 16 texts:  33%|███▎      | 33/100 [06:32<13:18, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.99it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 16 texts:  34%|███▍      | 34/100 [06:44<13:02, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.85it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 16 texts:  35%|███▌      | 35/100 [06:56<12:50, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.64it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 16 texts:  36%|███▌      | 36/100 [07:08<12:39, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.78it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 16 texts:  37%|███▋      | 37/100 [07:19<12:26, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.36it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 16 texts:  38%|███▊      | 38/100 [07:31<12:16, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.41it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 16 texts:  39%|███▉      | 39/100 [07:43<12:05, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.90it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 16 texts:  40%|████      | 40/100 [07:55<11:51, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.07it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 16 texts:  41%|████      | 41/100 [08:07<11:43, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.72it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 16 texts:  42%|████▏     | 42/100 [08:19<11:32, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.33it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 16 texts:  43%|████▎     | 43/100 [08:31<11:18, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.21it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 16 texts:  44%|████▍     | 44/100 [08:43<11:06, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.75it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 16 texts:  45%|████▌     | 45/100 [08:55<10:58, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.73it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 16 texts:  46%|████▌     | 46/100 [09:07<10:45, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.68it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.14it/s][A[A

Layer 16 texts:  47%|████▋     | 47/100 [09:19<10:35, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.73it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 16 texts:  48%|████▊     | 48/100 [09:31<10:22, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.45it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.15it/s][A[A

Layer 16 texts:  49%|████▉     | 49/100 [09:43<10:09, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.32it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 16 texts:  50%|█████     | 50/100 [09:55<09:57, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.29it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 16 texts:  51%|█████     | 51/100 [10:07<09:47, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.54it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 16 texts:  52%|█████▏    | 52/100 [10:19<09:38, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.15it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 16 texts:  53%|█████▎    | 53/100 [10:31<09:25, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.92it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 16 texts:  54%|█████▍    | 54/100 [10:43<09:10, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.08it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 16 texts:  55%|█████▌    | 55/100 [10:55<08:59, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.54it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 16 texts:  56%|█████▌    | 56/100 [11:07<08:46, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.58it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 16 texts:  57%|█████▋    | 57/100 [11:19<08:37, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.85it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 16 texts:  58%|█████▊    | 58/100 [11:31<08:26, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.26it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 16 texts:  59%|█████▉    | 59/100 [11:43<08:15, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.26it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 16 texts:  60%|██████    | 60/100 [11:55<08:03, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.37it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 16 texts:  61%|██████    | 61/100 [12:07<07:50, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.15it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 16 texts:  62%|██████▏   | 62/100 [12:19<07:38, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.54it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 16 texts:  63%|██████▎   | 63/100 [12:31<07:24, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.12it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 16 texts:  64%|██████▍   | 64/100 [12:43<07:10, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.14it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 16 texts:  65%|██████▌   | 65/100 [12:55<06:58, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.76it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 16 texts:  66%|██████▌   | 66/100 [13:07<06:44, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.97it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 16 texts:  67%|██████▋   | 67/100 [13:19<06:32, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.05it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 16 texts:  68%|██████▊   | 68/100 [13:31<06:20, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.67it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 16 texts:  69%|██████▉   | 69/100 [13:42<06:07, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.85it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 16 texts:  70%|███████   | 70/100 [13:55<05:58, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.66it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 16 texts:  71%|███████   | 71/100 [14:07<05:46, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.43it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 16 texts:  72%|███████▏  | 72/100 [14:18<05:33, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.52it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 16 texts:  73%|███████▎  | 73/100 [14:30<05:22, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.94it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 16 texts:  74%|███████▍  | 74/100 [14:42<05:10, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.68it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 16 texts:  75%|███████▌  | 75/100 [14:54<04:58, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.91it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 16 texts:  76%|███████▌  | 76/100 [15:06<04:46, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.99it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 16 texts:  77%|███████▋  | 77/100 [15:18<04:35, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.83it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 16 texts:  78%|███████▊  | 78/100 [15:30<04:23, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.75it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 16 texts:  79%|███████▉  | 79/100 [15:42<04:10, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.21it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 16 texts:  80%|████████  | 80/100 [15:54<03:58, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.36it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 16 texts:  81%|████████  | 81/100 [16:06<03:46, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.20it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 16 texts:  82%|████████▏ | 82/100 [16:18<03:35, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.48it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 16 texts:  83%|████████▎ | 83/100 [16:30<03:23, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.58it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 16 texts:  84%|████████▍ | 84/100 [16:42<03:10, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.05it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 16 texts:  85%|████████▌ | 85/100 [16:54<02:59, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.59it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 16 texts:  86%|████████▌ | 86/100 [17:06<02:49, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.36it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 16 texts:  87%|████████▋ | 87/100 [17:18<02:36, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.03it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 16 texts:  88%|████████▊ | 88/100 [17:30<02:24, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.51it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 16 texts:  89%|████████▉ | 89/100 [17:42<02:11, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.37it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 16 texts:  90%|█████████ | 90/100 [17:54<02:00, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.81it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 16 texts:  91%|█████████ | 91/100 [18:06<01:47, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.02it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 16 texts:  92%|█████████▏| 92/100 [18:18<01:35, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.75it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 16 texts:  93%|█████████▎| 93/100 [18:30<01:23, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.94it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 16 texts:  94%|█████████▍| 94/100 [18:42<01:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.50it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 16 texts:  95%|█████████▌| 95/100 [18:54<00:59, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.58it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 16 texts:  96%|█████████▌| 96/100 [19:06<00:47, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.63it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 16 texts:  97%|█████████▋| 97/100 [19:18<00:35, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.34it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 16 texts:  98%|█████████▊| 98/100 [19:30<00:24, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.46it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 16 texts:  99%|█████████▉| 99/100 [19:42<00:11, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.77it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 16 texts: 100%|██████████| 100/100 [19:54<00:00, 11.97s/it][A
Processing layers:  67%|██████▋   | 4/6 [1:20:34<40:10, 1205.20s/it]  

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
   Memory after layer 16: 12.83GB

📊 Processing Layer 20
   📥 Loading SAE Layer 20: layer_20/width_16k/canonical



Layer 20 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.87it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 20 texts:   1%|          | 1/100 [00:12<19:54, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.46it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 20 texts:   2%|▏         | 2/100 [00:24<19:39, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 54.74it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 20 texts:   3%|▎         | 3/100 [00:36<19:34, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.33it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 20 texts:   4%|▍         | 4/100 [00:48<19:17, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.52it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 20 texts:   5%|▌         | 5/100 [01:00<18:58, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.67it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 20 texts:   6%|▌         | 6/100 [01:12<18:49, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.94it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 20 texts:   7%|▋         | 7/100 [01:24<18:37, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.10it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 20 texts:   8%|▊         | 8/100 [01:36<18:21, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.72it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 20 texts:   9%|▉         | 9/100 [01:48<18:10, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.00it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 20 texts:  10%|█         | 10/100 [01:59<17:55, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.10it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 20 texts:  11%|█         | 11/100 [02:11<17:43, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.14it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 20 texts:  12%|█▏        | 12/100 [02:23<17:30, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.23it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 20 texts:  13%|█▎        | 13/100 [02:35<17:13, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.23it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 20 texts:  14%|█▍        | 14/100 [02:47<17:01, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.30it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 20 texts:  15%|█▌        | 15/100 [02:59<16:51, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.79it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 20 texts:  16%|█▌        | 16/100 [03:11<16:36, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.19it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 20 texts:  17%|█▋        | 17/100 [03:23<16:25, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.31it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 20 texts:  18%|█▊        | 18/100 [03:35<16:25, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.25it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 20 texts:  19%|█▉        | 19/100 [03:47<16:13, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.33it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 20 texts:  20%|██        | 20/100 [03:59<15:55, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.60it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 20 texts:  21%|██        | 21/100 [04:11<15:43, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.79it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 20 texts:  22%|██▏       | 22/100 [04:23<15:36, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.84it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 20 texts:  23%|██▎       | 23/100 [04:35<15:21, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.99it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 20 texts:  24%|██▍       | 24/100 [04:47<15:10, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.67it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 20 texts:  25%|██▌       | 25/100 [04:59<15:00, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.26it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 20 texts:  26%|██▌       | 26/100 [05:11<14:45, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.50it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 20 texts:  27%|██▋       | 27/100 [05:23<14:31, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.35it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 20 texts:  28%|██▊       | 28/100 [05:34<14:16, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.07it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 20 texts:  29%|██▉       | 29/100 [05:46<14:06, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.83it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 20 texts:  30%|███       | 30/100 [05:59<14:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.71it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 20 texts:  31%|███       | 31/100 [06:11<13:51, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.20it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 20 texts:  32%|███▏      | 32/100 [06:22<13:33, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.90it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 20 texts:  33%|███▎      | 33/100 [06:34<13:21, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.12it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 20 texts:  34%|███▍      | 34/100 [06:46<13:09, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.54it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 20 texts:  35%|███▌      | 35/100 [06:58<12:56, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.18it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 20 texts:  36%|███▌      | 36/100 [07:10<12:42, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.95it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 20 texts:  37%|███▋      | 37/100 [07:22<12:30, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.41it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 20 texts:  38%|███▊      | 38/100 [07:34<12:20, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.92it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 20 texts:  39%|███▉      | 39/100 [07:46<12:10, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.50it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 20 texts:  40%|████      | 40/100 [07:58<11:54, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.45it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 20 texts:  41%|████      | 41/100 [08:10<11:41, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.48it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 20 texts:  42%|████▏     | 42/100 [08:22<11:30, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.13it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 20 texts:  43%|████▎     | 43/100 [08:34<11:20, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.24it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 20 texts:  44%|████▍     | 44/100 [08:46<11:12, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.56it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 20 texts:  45%|████▌     | 45/100 [08:58<11:02, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.91it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 20 texts:  46%|████▌     | 46/100 [09:10<10:48, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.59it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 20 texts:  47%|████▋     | 47/100 [09:22<10:34, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.06it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 20 texts:  48%|████▊     | 48/100 [09:34<10:22, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.74it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 20 texts:  49%|████▉     | 49/100 [09:46<10:12, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.58it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 20 texts:  50%|█████     | 50/100 [09:58<09:58, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.81it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 20 texts:  51%|█████     | 51/100 [10:10<09:49, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.46it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 20 texts:  52%|█████▏    | 52/100 [10:22<09:36, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.80it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 20 texts:  53%|█████▎    | 53/100 [10:34<09:27, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.08it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 20 texts:  54%|█████▍    | 54/100 [10:46<09:11, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.17it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 20 texts:  55%|█████▌    | 55/100 [10:58<08:58, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.55it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 20 texts:  56%|█████▌    | 56/100 [11:10<08:45, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.75it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 20 texts:  57%|█████▋    | 57/100 [11:21<08:30, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.85it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 20 texts:  58%|█████▊    | 58/100 [11:33<08:18, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.53it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 20 texts:  59%|█████▉    | 59/100 [11:45<08:06, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.57it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.32it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 20 texts:  60%|██████    | 60/100 [11:57<07:54, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.70it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 20 texts:  61%|██████    | 61/100 [12:09<07:44, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.73it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 20 texts:  62%|██████▏   | 62/100 [12:21<07:33, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.28it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 20 texts:  63%|██████▎   | 63/100 [12:33<07:23, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.75it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 20 texts:  64%|██████▍   | 64/100 [12:45<07:10, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.88it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 20 texts:  65%|██████▌   | 65/100 [12:57<06:57, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.07it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 20 texts:  66%|██████▌   | 66/100 [13:08<06:43, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.96it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 20 texts:  67%|██████▋   | 67/100 [13:20<06:31, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.49it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 20 texts:  68%|██████▊   | 68/100 [13:32<06:20, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.21it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 20 texts:  69%|██████▉   | 69/100 [13:44<06:07, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.83it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 20 texts:  70%|███████   | 70/100 [13:56<05:55, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.11it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 20 texts:  71%|███████   | 71/100 [14:08<05:45, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.36it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 20 texts:  72%|███████▏  | 72/100 [14:20<05:35, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.14it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 20 texts:  73%|███████▎  | 73/100 [14:32<05:23, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.50it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 20 texts:  74%|███████▍  | 74/100 [14:44<05:13, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.35it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 20 texts:  75%|███████▌  | 75/100 [14:56<05:01, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.11it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 20 texts:  76%|███████▌  | 76/100 [15:08<04:46, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.12it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 20 texts:  77%|███████▋  | 77/100 [15:20<04:33, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.60it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.14it/s][A[A

Layer 20 texts:  78%|███████▊  | 78/100 [15:32<04:23, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.11it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 20 texts:  79%|███████▉  | 79/100 [15:44<04:09, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.37it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 20 texts:  80%|████████  | 80/100 [15:56<03:58, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.98it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 20 texts:  81%|████████  | 81/100 [16:08<03:47, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.05it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 20 texts:  82%|████████▏ | 82/100 [16:20<03:35, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.29it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 20 texts:  83%|████████▎ | 83/100 [16:32<03:23, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.76it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 20 texts:  84%|████████▍ | 84/100 [16:44<03:12, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.38it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 20 texts:  85%|████████▌ | 85/100 [16:56<03:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.93it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 20 texts:  86%|████████▌ | 86/100 [17:08<02:47, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.54it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 20 texts:  87%|████████▋ | 87/100 [17:19<02:35, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.09it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 20 texts:  88%|████████▊ | 88/100 [17:32<02:23, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.23it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 20 texts:  89%|████████▉ | 89/100 [17:43<02:11, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.13it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.32it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 20 texts:  90%|█████████ | 90/100 [17:55<01:59, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.99it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 20 texts:  91%|█████████ | 91/100 [18:08<01:48, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.56it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 20 texts:  92%|█████████▏| 92/100 [18:19<01:35, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.12it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 20 texts:  93%|█████████▎| 93/100 [18:32<01:24, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.18it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 20 texts:  94%|█████████▍| 94/100 [18:44<01:12, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.78it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 20 texts:  95%|█████████▌| 95/100 [18:55<00:59, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.82it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 20 texts:  96%|█████████▌| 96/100 [19:07<00:47, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.77it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 20 texts:  97%|█████████▋| 97/100 [19:19<00:35, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.39it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 20 texts:  98%|█████████▊| 98/100 [19:32<00:24, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.94it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 20 texts:  99%|█████████▉| 99/100 [19:43<00:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.29it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 20 texts: 100%|██████████| 100/100 [19:55<00:00, 11.98s/it][A
Processing layers:  83%|████████▎ | 5/6 [1:40:31<20:02, 1202.19s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
   Memory after layer 20: 12.83GB

📊 Processing Layer 24
   📥 Loading SAE Layer 24: layer_24/width_16k/canonical



Layer 24 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.63it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 24 texts:   1%|          | 1/100 [00:11<19:44, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.07it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 24 texts:   2%|▏         | 2/100 [00:23<19:29, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.13it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 24 texts:   3%|▎         | 3/100 [00:35<19:08, 11.84s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.48it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:   4%|▍         | 4/100 [00:47<19:00, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.34it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 24 texts:   5%|▌         | 5/100 [00:59<18:49, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.03it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 24 texts:   6%|▌         | 6/100 [01:11<18:35, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.59it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 24 texts:   7%|▋         | 7/100 [01:23<18:24, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.04it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 24 texts:   8%|▊         | 8/100 [01:35<18:15, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.84it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 24 texts:   9%|▉         | 9/100 [01:46<17:59, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.25it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 24 texts:  10%|█         | 10/100 [01:58<17:45, 11.84s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.79it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  11%|█         | 11/100 [02:10<17:37, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.78it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 24 texts:  12%|█▏        | 12/100 [02:22<17:23, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.48it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 24 texts:  13%|█▎        | 13/100 [02:34<17:12, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.12it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 24 texts:  14%|█▍        | 14/100 [02:46<17:01, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.40it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 24 texts:  15%|█▌        | 15/100 [02:58<16:50, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.67it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 24 texts:  16%|█▌        | 16/100 [03:10<16:37, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.16it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 24 texts:  17%|█▋        | 17/100 [03:21<16:24, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.26it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 24 texts:  18%|█▊        | 18/100 [03:33<16:12, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.91it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 24 texts:  19%|█▉        | 19/100 [03:45<16:03, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.02it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 24 texts:  20%|██        | 20/100 [03:57<15:48, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.01it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 24 texts:  21%|██        | 21/100 [04:09<15:35, 11.84s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.88it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  22%|██▏       | 22/100 [04:21<15:26, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.04it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 24 texts:  23%|██▎       | 23/100 [04:33<15:17, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.94it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 24 texts:  24%|██▍       | 24/100 [04:44<15:00, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.74it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 24 texts:  25%|██▌       | 25/100 [04:56<14:52, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.39it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 24 texts:  26%|██▌       | 26/100 [05:08<14:43, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.04it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 24 texts:  27%|██▋       | 27/100 [05:21<14:34, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.29it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 24 texts:  28%|██▊       | 28/100 [05:32<14:20, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.32it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 24 texts:  29%|██▉       | 29/100 [05:44<14:05, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.66it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 24 texts:  30%|███       | 30/100 [05:56<13:56, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.56it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 24 texts:  31%|███       | 31/100 [06:08<13:45, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.51it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 24 texts:  32%|███▏      | 32/100 [06:20<13:29, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.03it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  33%|███▎      | 33/100 [06:32<13:18, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.39it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 24 texts:  34%|███▍      | 34/100 [06:44<13:09, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.26it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 24 texts:  35%|███▌      | 35/100 [06:56<12:54, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.60it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 24 texts:  36%|███▌      | 36/100 [07:08<12:42, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.94it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 24 texts:  37%|███▋      | 37/100 [07:20<12:31, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.16it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 24 texts:  38%|███▊      | 38/100 [07:32<12:20, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.68it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  39%|███▉      | 39/100 [07:44<12:14, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.28it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  40%|████      | 40/100 [07:56<12:01, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.35it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 24 texts:  41%|████      | 41/100 [08:08<11:49, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.65it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  42%|████▏     | 42/100 [08:20<11:34, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.33it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 24 texts:  43%|████▎     | 43/100 [08:32<11:20, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.37it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 24 texts:  44%|████▍     | 44/100 [08:44<11:09, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.92it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 24 texts:  45%|████▌     | 45/100 [08:56<10:56, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.25it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 24 texts:  46%|████▌     | 46/100 [09:07<10:43, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.34it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 24 texts:  47%|████▋     | 47/100 [09:19<10:30, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.19it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 24 texts:  48%|████▊     | 48/100 [09:31<10:21, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.57it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 24 texts:  49%|████▉     | 49/100 [09:44<10:13, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.65it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  50%|█████     | 50/100 [09:56<10:03, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.01it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 24 texts:  51%|█████     | 51/100 [10:08<09:49, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.88it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 24 texts:  52%|█████▏    | 52/100 [10:19<09:33, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.62it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 24 texts:  53%|█████▎    | 53/100 [10:31<09:21, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.23it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 24 texts:  54%|█████▍    | 54/100 [10:43<09:08, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.58it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 24 texts:  55%|█████▌    | 55/100 [10:55<08:53, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.47it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 24 texts:  56%|█████▌    | 56/100 [11:07<08:42, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.40it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 24 texts:  57%|█████▋    | 57/100 [11:19<08:31, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.07it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 24 texts:  58%|█████▊    | 58/100 [11:31<08:17, 11.84s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.30it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 24 texts:  59%|█████▉    | 59/100 [11:42<08:05, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.15it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 24 texts:  60%|██████    | 60/100 [11:54<07:55, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.17it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 24 texts:  61%|██████    | 61/100 [12:06<07:42, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.83it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 24 texts:  62%|██████▏   | 62/100 [12:18<07:31, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.05it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.31it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 24 texts:  63%|██████▎   | 63/100 [12:30<07:18, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.82it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 24 texts:  64%|██████▍   | 64/100 [12:42<07:06, 11.84s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.00it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 24 texts:  65%|██████▌   | 65/100 [12:54<06:56, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.08it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 24 texts:  66%|██████▌   | 66/100 [13:06<06:47, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.00it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 24 texts:  67%|██████▋   | 67/100 [13:18<06:35, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.96it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 24 texts:  68%|██████▊   | 68/100 [13:30<06:23, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.15it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 24 texts:  69%|██████▉   | 69/100 [13:42<06:09, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.01it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 24 texts:  70%|███████   | 70/100 [13:54<05:57, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.17it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 24 texts:  71%|███████   | 71/100 [14:05<05:45, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 53.98it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 24 texts:  72%|███████▏  | 72/100 [14:17<05:33, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.95it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 24 texts:  73%|███████▎  | 73/100 [14:29<05:23, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.70it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  74%|███████▍  | 74/100 [14:41<05:09, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.00it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 24 texts:  75%|███████▌  | 75/100 [14:53<04:57, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.81it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 24 texts:  76%|███████▌  | 76/100 [15:05<04:47, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.10it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  77%|███████▋  | 77/100 [15:17<04:34, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.84it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 24 texts:  78%|███████▊  | 78/100 [15:29<04:23, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.72it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s][A[A

Layer 24 texts:  79%|███████▉  | 79/100 [15:41<04:10, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.89it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 24 texts:  80%|████████  | 80/100 [15:53<03:58, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.99it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 24 texts:  81%|████████  | 81/100 [16:05<03:46, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.25it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 24 texts:  82%|████████▏ | 82/100 [16:17<03:34, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.49it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 24 texts:  83%|████████▎ | 83/100 [16:29<03:23, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.43it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 24 texts:  84%|████████▍ | 84/100 [16:41<03:12, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.51it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 24 texts:  85%|████████▌ | 85/100 [16:53<02:59, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.49it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  86%|████████▌ | 86/100 [17:05<02:47, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.27it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  87%|████████▋ | 87/100 [17:17<02:35, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.32it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 24 texts:  88%|████████▊ | 88/100 [17:29<02:23, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.95it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 24 texts:  89%|████████▉ | 89/100 [17:41<02:12, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.64it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 24 texts:  90%|█████████ | 90/100 [17:53<02:00, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.23it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 24 texts:  91%|█████████ | 91/100 [18:05<01:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.46it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 24 texts:  92%|█████████▏| 92/100 [18:17<01:35, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.16it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 24 texts:  93%|█████████▎| 93/100 [18:29<01:23, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.04it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  94%|█████████▍| 94/100 [18:41<01:12, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.37it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 24 texts:  95%|█████████▌| 95/100 [18:53<01:00, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.93it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  96%|█████████▌| 96/100 [19:05<00:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.76it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 24 texts:  97%|█████████▋| 97/100 [19:17<00:35, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.97it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 24 texts:  98%|█████████▊| 98/100 [19:29<00:23, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.35it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 24 texts:  99%|█████████▉| 99/100 [19:41<00:11, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.17it/s]


📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 24 texts: 100%|██████████| 100/100 [19:53<00:00, 11.97s/it][A
Processing layers: 100%|██████████| 6/6 [2:00:25<00:00, 1204.21s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
   Memory after layer 24: 12.83GB

📊 LAYER SWEEP RESULTS:
Most Similar Layer: 4
Most Different Layer: 24
Highest Feature Overlap Layer: 8
Average Similarity Across Layers: 0.638
Average Feature Overlap: 0.028

🔍 INTERPRETATIONS:
Adaptation Magnitude: 🔍 SIGNIFICANT LLM→VLM adaptation - substantial representational reorganization
Adaptation Location: 🎯 Layer 24 shows maximum adaptation
Feature Preservation: 🔗 Layer 8 best preserves LLM features
Adaptation Pattern: 📈 Early layers preserve LLM representations better than late layers
Sae Quality: ❌ SAE reconstructions significantly impact model functionality

📈 Generating Visualizations...





✅ Layer sweep visualization saved to ../figs_tabs/google_gemma_2_2b_google_paligemma2_3b_pt_224_layer_sweep.png
✅ Detailed results saved to ../figs_tabs/google_gemma_2_2b_google_paligemma2_3b_pt_224_results.json
❌ Error during analysis: 'MemoryEfficientSAEAnalyzer' object has no attribute '_analyze_top_features_trends'

💡 Troubleshooting Tips:
   1. Ensure sufficient GPU memory (8GB+ recommended)
   2. Reduce LAYERS list or sample size if out of memory
   3. Check model names are correct and accessible
   4. Install required packages: pip install sae-lens transformers datasets


Traceback (most recent call last):
  File "/tmp/ipykernel_476945/2513033672.py", line 1165, in main
    analyzer.visualize_layer_sweep_results(results, model1_name, model2_name)
  File "/tmp/ipykernel_476945/2513033672.py", line 1047, in visualize_layer_sweep_results
    self._analyze_top_features_trends()
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'MemoryEfficientSAEAnalyzer' object has no attribute '_analyze_top_features_trends'


'\npip install sae-lens transformers torch matplotlib seaborn numpy datasets tqdm\n\n# For CUDA support (recommended):\npip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n'