In [4]:
#!/usr/bin/env python3
"""
Comprehensive SAE-based representation shift analysis with layer sweeping,
real datasets, and patching logic for LLM->VLM adaptation studies.
"""
# Installation requirements:
"""
pip install sae-lens transformers torch matplotlib seaborn numpy datasets tqdm

# For CUDA support (recommended):
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
"""

import torch
import numpy as np
import os
import gc
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM
from sae_lens import SAE
import matplotlib.pyplot as plt
from typing import Dict, Tuple, List, Optional, Union
from dataclasses import dataclass
import seaborn as sns
from datasets import load_dataset
import json
from tqdm import tqdm
import warnings
import random
import json
warnings.filterwarnings("ignore")



In [None]:
"""
Comprehensive SAE-based representation shift analysis with layer sweeping,
real datasets, and patching logic for LLM->VLM adaptation studies.
FIXED: Handles PaliGemma loss computation correctly.
"""

import torch
import numpy as np
import os
import gc
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM
from sae_lens import SAE
import matplotlib.pyplot as plt
from typing import Dict, Tuple, List, Optional, Union
from dataclasses import dataclass
import seaborn as sns
from datasets import load_dataset
import json
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
import torch.nn.functional as F

# Disable gradients globally for memory efficiency
torch.set_grad_enabled(False)

@dataclass
class SAEMetrics:
    """Container for SAE evaluation metrics."""
    reconstruction_loss: float
    l0_sparsity: float
    l1_sparsity: float
    fraction_alive: float
    mean_max_activation: float
    reconstruction_score: float
    model_delta_loss: float 
    rec_loss_topk: float

@dataclass
class RepresentationShift:
    """Container for representation shift metrics."""
    cosine_similarity: float
    l2_distance: float
    feature_overlap: float
    js_divergence: float
    feature_correlation: float

class DatasetLoader:
    """Handles loading and preprocessing of various datasets."""
    
    def __init__(self, device: str = "cuda"):
        self.device = device
    
    def load_cifar100_captions(self, split: str = "train", max_samples: int = 100) -> List[str]:
        """Load CIFAR-100 with generated captions for multimodal analysis."""
        try:
            # CIFAR-100 doesn't have captions by default, so we create descriptive ones
            dataset = load_dataset("cifar100", split=split)
            
            # CIFAR-100 class names
            class_names = [
                'apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle',
                'bicycle', 'bottle', 'bowl', 'boy', 'bridge', 'bus', 'butterfly', 'camel',
                'can', 'castle', 'caterpillar', 'cattle', 'chair', 'chimpanzee', 'clock',
                'cloud', 'cockroach', 'couch', 'crab', 'crocodile', 'cup', 'dinosaur',
                'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster',
                'house', 'kangaroo', 'keyboard', 'lamp', 'lawn_mower', 'leopard', 'lion',
                'lizard', 'lobster', 'man', 'maple_tree', 'motorcycle', 'mountain', 'mouse',
                'mushroom', 'oak_tree', 'orange', 'orchid', 'otter', 'palm_tree', 'pear',
                'pickup_truck', 'pine_tree', 'plain', 'plate', 'poppy', 'porcupine',
                'possum', 'rabbit', 'raccoon', 'ray', 'road', 'rocket', 'rose',
                'sea', 'seal', 'shark', 'shrew', 'skunk', 'skyscraper', 'snail', 'snake',
                'spider', 'squirrel', 'streetcar', 'sunflower', 'sweet_pepper', 'table',
                'tank', 'telephone', 'television', 'tiger', 'tractor', 'train', 'trout',
                'tulip', 'turtle', 'wardrobe', 'whale', 'willow_tree', 'wolf', 'woman', 'worm'
            ]
            
            texts = []
            for i, sample in enumerate(dataset):
                if i >= max_samples:
                    break
                class_name = class_names[sample['fine_label']]
                # Generate descriptive captions
                captions = [
                    f"This is a photo of a {class_name}.",
                    f"An image showing a {class_name}.",
                    f"A picture of a {class_name} in natural setting.",
                    f"Visual representation of a {class_name}."
                ]
                texts.extend(captions[:2])  # Take 2 captions per image
            
            print(f"✅ Loaded {len(texts)} CIFAR-100 captions")
            return texts[:max_samples]
            
        except Exception as e:
            print(f"❌ Error loading CIFAR-100: {e}")
            return self._get_fallback_texts()
    
    def load_coco_captions(self, split: str = "validation", max_samples: int = 100) -> List[str]:
        """Load COCO captions dataset."""
        try:
            # Load COCO captions
            dataset = load_dataset("HuggingFaceM4/COCO", split=split)
            
            texts = []
            for i, sample in enumerate(dataset):
                if i >= max_samples:
                    break
                
                # COCO has multiple captions per image
                if 'sentences' in sample and 'raw' in sample['sentences']:
                    for sentence in sample['sentences']['raw'][:2]:  # Take first 2 captions
                        texts.append(sentence)
                elif 'caption' in sample:
                    texts.append(sample['caption'])
            
            print(f"✅ Loaded {len(texts)} COCO captions")
            return texts[:max_samples]
            
        except Exception as e:
            print(f"❌ Error loading COCO: {e}")
            # Try alternative COCO dataset
            try:
                dataset = load_dataset("nielsr/coco-captions", split="validation")
                texts = [sample['caption'] for sample in dataset.select(range(min(max_samples, len(dataset))))]
                print(f"✅ Loaded {len(texts)} COCO captions (alternative)")
                return texts
            except:
                return self._get_fallback_texts()
    
    def load_llava_bench(self, max_samples: int = 100) -> List[str]:
        """Load LLaVA-Bench questions/descriptions."""
        try:
            # LLaVA bench conversations
            dataset = load_dataset("lmms-lab/LLaVA-OneVision-Data", split="dev_mini")
            
            texts = []
            for i, sample in enumerate(dataset):
                if i >= max_samples:
                    break
                
                if 'conversations' in sample:
                    for conv in sample['conversations'][:2]:  # Take first 2 conversations
                        if 'value' in conv:
                            texts.append(conv['value'])
            
            print(f"✅ Loaded {len(texts)} LLaVA-Bench texts")
            return texts[:max_samples]
            
        except Exception as e:
            print(f"❌ Error loading LLaVA-Bench: {e}")
            return self._get_fallback_texts()
    
    def _get_fallback_texts(self) -> List[str]:
        """Fallback texts if datasets fail to load."""
        return [
            "A photo of a red apple on a white background.",
            "The cat is sitting on a wooden chair.",
            "Mountains covered with snow in winter landscape.",
            "A blue car driving on a highway.",
            "Children playing in a park with green grass.",
            "A delicious chocolate cake on a plate.",
            "Ocean waves crashing against rocky shore.",
            "A person reading a book in a library.",
            "Colorful flowers blooming in spring garden.",
            "A dog running happily in the field.",
        ]
    
    def get_mixed_dataset(self, total_samples: int = 150) -> List[str]:
        """Get a mixed dataset from multiple sources."""
        samples_per_source = total_samples // 3
        
        texts = []
        texts.extend(self.load_cifar100_captions(max_samples=samples_per_source))
        texts.extend(self.load_coco_captions(max_samples=samples_per_source))
        texts.extend(self.load_llava_bench(max_samples=samples_per_source))
        
        # Shuffle for good measure
        import random
        random.shuffle(texts)
        
        return texts[:total_samples]

class MemoryEfficientSAEAnalyzer:
    """Memory-efficient SAE analyzer with layer sweeping and patching logic."""
    
    def __init__(self, 
                 model_size: str = "2b",
                 width: str = "16k", 
                 suffix: str = "canonical",
                 device: str = "cuda",
                 output_dir: str = "../figs_tabs"):
        """
        Initialize memory-efficient SAE analyzer.
        
        Args:
            model_size: Model size ("2b" or "9b")
            width: SAE width ("16k", "65k", "262k")
            suffix: SAE variant ("canonical" or specific L0)
            device: Device to use
            output_dir: Directory for saving outputs
        """
        self.device = device if torch.cuda.is_available() else "cpu"
        self.model_size = model_size
        self.width = width
        self.suffix = suffix
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        
        # Model cache for memory efficiency
        self.model_cache = {}
        self.sae_cache = {}
        
        print(f"🔧 Initialized SAE Analyzer")
        print(f"   Device: {self.device}")
        print(f"   Model Size: {model_size}")
        print(f"   SAE Width: {width}")
        print(f"   Output Dir: {output_dir}")

    def get_gemmascope_sae(self, layer: int) -> SAE:
        """Load Gemma Scope SAE with caching for memory efficiency."""
        cache_key = f"layer_{layer}"
        
        if cache_key in self.sae_cache:
            return self.sae_cache[cache_key]
        
        release = f"gemma-scope-{self.model_size}-pt-res"
        if self.suffix == "canonical":
            release = f"gemma-scope-{self.model_size}-pt-res-canonical"
            sae_id = f"layer_{layer}/width_{self.width}/canonical"
        else:
            sae_id = f"layer_{layer}/width_{self.width}/{self.suffix}"
        
        print(f"   📥 Loading SAE Layer {layer}: {sae_id}")
        
        try:
            sae = SAE.from_pretrained(release, sae_id).to(self.device)
            sae.eval()
            
            # Cache management - keep only last 2 SAEs to save memory
            if len(self.sae_cache) >= 2:
                oldest_key = list(self.sae_cache.keys())[0]
                del self.sae_cache[oldest_key]
                gc.collect()
            
            self.sae_cache[cache_key] = sae
            return sae
            
        except Exception as e:
            print(f"❌ Error loading SAE layer {layer}: {e}")
            raise

    def get_model(self, model_name: str):
        """Load model with caching and proper device placement."""
        if model_name in self.model_cache:
            return self.model_cache[model_name]
        
        print(f"📥 Loading model: {model_name}")
        
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            
            # Handle different model types
            if "paligemma" in model_name.lower():
                from transformers import PaliGemmaForConditionalGeneration
                model = PaliGemmaForConditionalGeneration.from_pretrained(
                    model_name, 
                    trust_remote_code=True,
                    torch_dtype=torch.float32,  # Use fp16 for memory efficiency
                    device_map=None  # We'll handle device placement manually
                )
                model = model.to(self.device)
                language_model = model.language_model
            else:
                model = AutoModelForCausalLM.from_pretrained(
                    model_name, 
                    trust_remote_code=True,
                    torch_dtype=torch.float32,
                    device_map=None
                )
                model = model.to(self.device)
                language_model = model
            
            language_model.eval()
            
            # Cache management - keep only one model at a time
            if len(self.model_cache) >= 1:
                for cached_name in list(self.model_cache.keys()):
                    del self.model_cache[cached_name]
                gc.collect()
                torch.cuda.empty_cache() if torch.cuda.is_available() else None
            
            self.model_cache[model_name] = (tokenizer, model, language_model)
            return tokenizer, model, language_model
            
        except Exception as e:
            print(f"❌ Error loading model {model_name}: {e}")
            raise

    def extract_activations_with_patching(self, 
                                        model_name: str, 
                                        text: str, 
                                        layer: int,
                                        sae: Optional[SAE] = None) -> Tuple[torch.Tensor, float]:
        """
        Extract activations and compute model delta loss with patching.
        FIXED: Addresses CUDA device-side assert errors with proper tokenization and loss computation.

        Returns:
            Tuple of (activations, model_delta_loss)
        """
        tokenizer, model, language_model = self.get_model(model_name)

        # FIXED: More robust tokenization with proper padding token handling
        # Ensure we have a pad token
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id

        # Tokenize with safer parameters
        inputs = tokenizer(
            text, 
            return_tensors="pt", 
            padding="max_length",
            truncation=True,
            max_length=64,
            add_special_tokens=True  # Ensure special tokens are added properly
        )

        # FIXED: Validate token IDs are within vocabulary range
        vocab_size = tokenizer.vocab_size
        input_ids = inputs['input_ids']

        # Check for out-of-bounds token IDs
        if torch.any(input_ids >= vocab_size) or torch.any(input_ids < 0):
            print(f"⚠️  Invalid token IDs detected. Max ID: {input_ids.max()}, Vocab size: {vocab_size}")
            # Clamp invalid IDs to valid range
            input_ids = torch.clamp(input_ids, 0, vocab_size - 1)
            inputs['input_ids'] = input_ids

        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # FIXED: More robust label creation
        def create_labels(input_ids, pad_token_id):
            """Create labels with proper masking for loss computation"""
            labels = input_ids.clone()
            # Mask padding tokens
            labels[labels == pad_token_id] = -100
            # FIXED: Also mask the first token (often BOS) to avoid issues
            if labels.size(1) > 1:
                labels[:, 0] = -100
            return labels

        # Get unpatched model loss (baseline)
        unpatched_loss = 0.0
        with torch.no_grad():
            try:
                if "paligemma" in model_name.lower():
                    # For PaliGemma, we need to handle text-only input differently
                    labels = create_labels(inputs['input_ids'], tokenizer.pad_token_id)

                    # Get outputs from language model
                    unpatched_outputs = language_model(**inputs)

                    # Check if we have logits to compute loss
                    if hasattr(unpatched_outputs, 'logits'):
                        logits = unpatched_outputs.logits

                        # FIXED: More robust loss computation with better shape handling
                        if logits.size(1) > 1 and labels.size(1) > 1:
                            shift_logits = logits[..., :-1, :].contiguous()
                            shift_labels = labels[..., 1:].contiguous()

                            # Ensure we have valid data for loss computation
                            valid_mask = shift_labels != -100
                            if valid_mask.any():
                                shift_logits = shift_logits.view(-1, shift_logits.size(-1))
                                shift_labels = shift_labels.view(-1)

                                # FIXED: Use reduction='mean' and handle empty tensors
                                loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='mean')
                                unpatched_loss = loss_fct(shift_logits, shift_labels).item()
                            else:
                                print("⚠️  No valid tokens for loss computation")
                                unpatched_loss = 0.0
                        else:
                            print("⚠️  Insufficient sequence length for loss computation")
                            unpatched_loss = 0.0
                    else:
                        # Fallback for models without logits
                        unpatched_loss = 0.0
                        print(f"⚠️  No logits available for {model_name}, using zero loss")

                else:
                    # For regular language models
                    labels = create_labels(inputs['input_ids'], tokenizer.pad_token_id)
                    unpatched_outputs = language_model(**inputs, labels=labels)

                    if hasattr(unpatched_outputs, 'loss') and unpatched_outputs.loss is not None:
                        unpatched_loss = unpatched_outputs.loss.item()
                    else:
                        # FIXED: Same robust loss computation as above
                        if hasattr(unpatched_outputs, 'logits'):
                            logits = unpatched_outputs.logits

                            if logits.size(1) > 1 and labels.size(1) > 1:
                                shift_logits = logits[..., :-1, :].contiguous()
                                shift_labels = labels[..., 1:].contiguous()

                                valid_mask = shift_labels != -100
                                if valid_mask.any():
                                    shift_logits = shift_logits.view(-1, shift_logits.size(-1))
                                    shift_labels = shift_labels.view(-1)

                                    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='mean')
                                    unpatched_loss = loss_fct(shift_logits, shift_labels).item()
                                else:
                                    unpatched_loss = 0.0
                            else:
                                unpatched_loss = 0.0
                        else:
                            unpatched_loss = 0.0

            except Exception as e:
                print(f"⚠️  Error computing unpatched loss: {e}")
                unpatched_loss = 0.0

        # Extract activations from target layer
        activations = None
        patched_loss = unpatched_loss  # Default if no patching

        def activation_hook(module, input, output):
            nonlocal activations
            try:
                if isinstance(output, tuple):
                    activations = output[0].clone().detach()
                else:
                    activations = output.clone().detach()
            except Exception as e:
                print(f"⚠️  Error in activation hook: {e}")

        # FIXED: More robust layer identification
        target_layer = None
        try:
            if hasattr(language_model, 'model') and hasattr(language_model.model, 'layers'):
                if layer < len(language_model.model.layers):
                    target_layer = language_model.model.layers[layer]
                else:
                    print(f"❌ Layer {layer} out of range. Model has {len(language_model.model.layers)} layers")
                    return torch.randn(1, 64, 2304).to(self.device), 0.0
            elif hasattr(language_model, 'layers'):
                if layer < len(language_model.layers):
                    target_layer = language_model.layers[layer]
                else:
                    print(f"❌ Layer {layer} out of range. Model has {len(language_model.layers)} layers")
                    return torch.randn(1, 64, 2304).to(self.device), 0.0
            else:
                print(f"❌ Could not find layers in model structure")
                return torch.randn(1, 64, 2304).to(self.device), 0.0
        except Exception as e:
            print(f"❌ Error accessing layer {layer}: {e}")
            return torch.randn(1, 64, 2304).to(self.device), 0.0

        if target_layer is None:
            print(f"❌ Could not find layer {layer}")
            return torch.randn(1, 64, 2304).to(self.device), 0.0

        hook = target_layer.register_forward_hook(activation_hook)

        # Forward pass to get activations
        with torch.no_grad():
            try:
                if "paligemma" in model_name.lower():
                    _ = language_model(**inputs)
                else:
                    _ = language_model(**inputs)
            except Exception as e:
                print(f"⚠️  Error in activation extraction: {e}")

        hook.remove()

        # Compute patched loss if SAE is provided
        if sae is not None and activations is not None:
            patched_loss = self._compute_patched_loss(
                language_model, inputs, activations, sae, layer, model_name, tokenizer
            )

        model_delta_loss = patched_loss - unpatched_loss

        if activations is None:
            print(f"⚠️  Failed to extract activations from layer {layer}")
            # FIXED: Return appropriate tensor size based on model
            try:
                # Try to get the actual hidden size from the model config
                if hasattr(language_model, 'config') and hasattr(language_model.config, 'hidden_size'):
                    hidden_size = language_model.config.hidden_size
                else:
                    hidden_size = 2304  # fallback
                activations = torch.randn(1, 64, hidden_size).to(self.device)
            except:
                activations = torch.randn(1, 64, 2304).to(self.device)

        return activations, model_delta_loss

    def _compute_patched_loss(self, 
                            language_model, 
                            inputs: Dict, 
                            original_activations: torch.Tensor, 
                            sae: SAE, 
                            layer: int,
                            model_name: str,
                            tokenizer) -> float:
        """Compute loss with SAE-patched activations. FIXED: Robust error handling and loss computation."""
        try:
            # Get SAE reconstruction
            flat_activations = original_activations.view(-1, original_activations.size(-1))
            print(f"Activations shape: {flat_activations.shape}")

            sae_output = sae(flat_activations)

            # Handle different SAE output formats
            if hasattr(sae_output, 'sae_out'):
                reconstructed = sae_output.sae_out
            elif isinstance(sae_output, tuple):
                reconstructed = sae_output[0]
            else:
                reconstructed = sae_output

            # Reshape back to original shape
            reconstructed = reconstructed.view(original_activations.shape)

            # Patch the reconstructed activations back into the model
            patched_activations = reconstructed.detach()  # FIXED: Ensure no gradients

            # Create a patching hook
            def patching_hook(module, input, output):
                try:
                    if isinstance(output, tuple):
                        return (patched_activations, *output[1:])
                    else:
                        return patched_activations
                except Exception as e:
                    print(f"⚠️  Error in patching hook: {e}")
                    return output  # Return original if patching fails

            # Hook the target layer for patching
            target_layer = None
            if hasattr(language_model, 'model') and hasattr(language_model.model, 'layers'):
                if layer < len(language_model.model.layers):
                    target_layer = language_model.model.layers[layer]
            elif hasattr(language_model, 'layers'):
                if layer < len(language_model.layers):
                    target_layer = language_model.layers[layer]

            if target_layer is None:
                return 0.0

            patch_hook = target_layer.register_forward_hook(patching_hook)

            # FIXED: Use the same robust label creation as in main function
            def create_labels(input_ids, pad_token_id):
                labels = input_ids.clone()
                labels[labels == pad_token_id] = -100
                if labels.size(1) > 1:
                    labels[:, 0] = -100
                return labels

            # Forward pass with patched activations
            patched_loss = 0.0
            with torch.no_grad():
                if "paligemma" in model_name.lower():
                    labels = create_labels(inputs['input_ids'], tokenizer.pad_token_id)
                    patched_outputs = language_model(**inputs)

                    if hasattr(patched_outputs, 'logits'):
                        logits = patched_outputs.logits

                        if logits.size(1) > 1 and labels.size(1) > 1:
                            shift_logits = logits[..., :-1, :].contiguous()
                            shift_labels = labels[..., 1:].contiguous()

                            valid_mask = shift_labels != -100
                            if valid_mask.any():
                                shift_logits = shift_logits.view(-1, shift_logits.size(-1))
                                shift_labels = shift_labels.view(-1)

                                loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='mean')
                                patched_loss = loss_fct(shift_logits, shift_labels).item()
                    else:
                        patched_loss = 0.0
                else:
                    labels = create_labels(inputs['input_ids'], tokenizer.pad_token_id)
                    patched_outputs = language_model(**inputs, labels=labels)

                    if hasattr(patched_outputs, 'loss') and patched_outputs.loss is not None:
                        patched_loss = patched_outputs.loss.item()
                    else:
                        if hasattr(patched_outputs, 'logits'):
                            logits = patched_outputs.logits

                            if logits.size(1) > 1 and labels.size(1) > 1:
                                shift_logits = logits[..., :-1, :].contiguous()
                                shift_labels = labels[..., 1:].contiguous()

                                valid_mask = shift_labels != -100
                                if valid_mask.any():
                                    shift_logits = shift_logits.view(-1, shift_logits.size(-1))
                                    shift_labels = shift_labels.view(-1)

                                    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='mean')
                                    patched_loss = loss_fct(shift_logits, shift_labels).item()

            patch_hook.remove()
            return patched_loss

        except Exception as e:
            print(f"⚠️  Patching failed: {e}")
            return 0.0

    def compute_sae_metrics(self, activations: torch.Tensor, sae: SAE, model_delta_loss: float) -> SAEMetrics:
        """Compute comprehensive SAE evaluation metrics including model delta loss and top-20 features."""
        with torch.no_grad():
            # Reshape activations for SAE processing
            batch_size, seq_len, d_model = activations.shape
            flat_activations = activations.view(-1, d_model)
            
            # Forward pass through SAE
            sae_output = sae(flat_activations) 
            
            # Handle different SAE output formats
            if hasattr(sae_output, 'feature_acts'):
                feature_acts = sae_output.feature_acts # shape (batch_size * seq_len,  latent_dim)
                reconstructed = sae_output.sae_out
            elif isinstance(sae_output, tuple) and len(sae_output) >= 2:
                reconstructed, feature_acts = sae_output[0], sae_output[1]
            elif hasattr(sae, 'encode') and hasattr(sae, 'decode'):
                feature_acts = sae.encode(flat_activations)
                reconstructed = sae.decode(feature_acts)
            else:
                reconstructed = sae_output
                if hasattr(sae, 'W_enc') and hasattr(sae, 'b_enc'):
                    feature_acts = torch.relu(flat_activations @ sae.W_enc + sae.b_enc)
                else:
                    print(f"Failed retrieving SAE reconstructions, random intialisign...")
                    feature_acts = torch.randn(flat_activations.shape[0], 16384, device=flat_activations.device)
            
            # 1. Reconstruction Loss (MSE)
            reconstruction_loss = torch.nn.functional.mse_loss(reconstructed, flat_activations).item()
            
            # 2. L0 Sparsity (fraction of non-zero features)
            l0_sparsity = (feature_acts > 0).float().mean().item()
            
            # 3. L1 Sparsity (mean absolute activation)
            l1_sparsity = feature_acts.abs().mean().item()
            
            # 4. Fraction of features that are ever active
            fraction_alive = (feature_acts.max(dim=0)[0] > 0).float().mean().item()
            
            # 5. Mean maximum activation per sample
            mean_max_activation = feature_acts.max(dim=1)[0].mean().item()
            
            # 6. Reconstruction score (explained variance)
            var_original = flat_activations.var(dim=0).mean()
            var_residual = (flat_activations - reconstructed).var(dim=0).mean()
            reconstruction_score = max(0.0, 1 - (var_residual / var_original).item())
            
            # Store top-20 features for analysis
            mean_feature_acts = feature_acts.mean(dim=0)  # Average across all tokens/samples
            top_20_indices = torch.topk(mean_feature_acts, k=min(20, feature_acts.size(-1)))[1]
            self._store_top_features(top_20_indices, mean_feature_acts, 
                                   reconstruction_loss, l0_sparsity, model_delta_loss)
            
            # top-20 rec loss
            top_acts = feature_acts[..., top_20_indices] # shape (batch_size * seq_len,  latent_dim)
            if hasattr(sae, 'decode'):
                latent_dim = feature_acts.size(-1)  # e.g., 16384
                z_sparse = torch.zeros(feature_acts.size(0), latent_dim,
                                       device=feature_acts.device, dtype=feature_acts.dtype)
                z_sparse[:, top_20_indices] = top_acts  # place the 20 activations at their true indices
                recon_from_topk = sae.decode(z_sparse)  # ✅ correct shape
#                 recon_from_topk = sae.decode( top_acts )  # if your SAE supports that
            else:
                if hasattr(sae, 'W_dec') and hasattr(sae, 'b_dec'):
                    # Select the relevant columns from W_dec for the top-20 features
                    W_dec_topk = sae.W_dec[:, top_20_indices]  # Select columns corresponding to top-20 activations

                    # If necessary, apply a bias term (assuming b_dec is shared across all features)
                    b_dec_topk = sae.b_dec  # Bias term stays the same for all activations
                    print(top_acts.shape, W_dec_topk.shape, b_dec_topk.shape)
                    # Reconstruct the activations from the top-20 features
                    recon_from_topk = torch.relu(top_acts @ W_dec_topk + b_dec_topk)
            
            rec_loss_topk = F.mse_loss(recon_from_topk, flat_activations).item()
            
            return SAEMetrics(
                reconstruction_loss=reconstruction_loss,
                l0_sparsity=l0_sparsity,
                l1_sparsity=l1_sparsity,
                fraction_alive=fraction_alive,
                mean_max_activation=mean_max_activation,
                reconstruction_score=reconstruction_score,
                model_delta_loss=model_delta_loss,
                rec_loss_topk=rec_loss_topk
            )
    
    def _store_top_features(self, top_indices: torch.Tensor, feature_acts: torch.Tensor, 
                           recon_loss: float, sparsity: float, delta_loss: float):
        """Store top-20 activated features for analysis."""
        if not hasattr(self, 'top_features_log'):
            self.top_features_log = []
        
        top_features_info = {
            'top_20_indices': top_indices.cpu().tolist(),
            'top_20_activations': feature_acts[top_indices].cpu().tolist(),
            'reconstruction_loss': recon_loss,
            'sparsity': sparsity,
            'delta_loss': delta_loss,
            'timestamp': len(self.top_features_log)  # Simple counter
        }
        
        self.top_features_log.append(top_features_info)

    def analyze_layer_sweep(self, 
                           model1_name: str, 
                           model2_name: str, 
                           texts: List[str],
                           layers: List[int] = None) -> Dict:
        """
        Perform memory-efficient layer sweep analysis.
        
        Args:
            model1_name: First model (base LLM)
            model2_name: Second model (VLM) 
            texts: List of texts to analyze
            layers: List of layers to analyze (default: [8, 12, 16, 20])
        """
        if layers is None:
            layers = [8, 12, 16, 20]  # Sample layers across the model
        
        print(f"🚀 Starting Layer Sweep Analysis")
        print(f"   Model 1: {model1_name}")
        print(f"   Model 2: {model2_name}")
        print(f"   Layers: {layers}")
        print(f"   Texts: {len(texts)} samples")
        print(f"   Memory: {torch.cuda.memory_allocated() / 1e9:.2f}GB" if torch.cuda.is_available() else "")
        
        results = {
            'layers': layers,
            'layer_results': {},
            'texts': texts[:10]  # Store subset for reference
        }
        
        for layer in tqdm(layers, desc="Processing layers"):
            print(f"\n📊 Processing Layer {layer}")
            
            # Load SAE for this layer
            sae = self.get_gemmascope_sae(layer)
            
            layer_metrics = {
                'model1_metrics': [],
                'model2_metrics': [],
                'shift_metrics': []
            }
            
            # Process subset of texts for each layer (memory efficiency)
            sample_texts = texts[:100]  # Process 100 texts per layer (increased from 20)
            
            for i, text in enumerate(tqdm(sample_texts, desc=f"Layer {layer} texts", leave=False)):
                try:
                    # Extract activations and compute metrics for model 1
                    acts1, delta_loss1 = self.extract_activations_with_patching(
                        model1_name, text, layer, sae
                    )
                    metrics1 = self.compute_sae_metrics(acts1, sae, delta_loss1)
                    
                    # Extract activations and compute metrics for model 2
                    acts2, delta_loss2 = self.extract_activations_with_patching(
                        model2_name, text, layer, sae
                    )
                    metrics2 = self.compute_sae_metrics(acts2, sae, delta_loss2)
                    
                    # Compute representation shift
                    shift = self.compute_representation_shift(acts1, acts2, sae)
                    
                    layer_metrics['model1_metrics'].append(metrics1)
                    layer_metrics['model2_metrics'].append(metrics2)
                    layer_metrics['shift_metrics'].append(shift)
                    
                except Exception as e:
                    print(f"⚠️  Error processing text {i} in layer {layer}: {e}")
                    continue
            
            # Compute layer-level aggregates
            layer_metrics['aggregate'] = self._compute_layer_aggregate(layer_metrics)
            results['layer_results'][layer] = layer_metrics
            
            # Memory cleanup
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                print(f"   Memory after layer {layer}: {torch.cuda.memory_allocated() / 1e9:.2f}GB")
        
        # Compute overall analysis
        results['overall_analysis'] = self._compute_overall_analysis(results)
        
        return results

    def compute_representation_shift(self, 
                                   activations1: torch.Tensor, 
                                   activations2: torch.Tensor,
                                   sae: SAE) -> RepresentationShift:
        """Compute representation shift metrics using SAE features."""
        with torch.no_grad():
            # Process both activation sets through SAE
            flat_acts1 = activations1.view(-1, activations1.size(-1))
            flat_acts2 = activations2.view(-1, activations2.size(-1))
            
            # Get SAE features
            def extract_features(flat_acts):
                sae_output = sae(flat_acts)
                if hasattr(sae_output, 'feature_acts'):
                    return sae_output.feature_acts
                elif isinstance(sae_output, tuple) and len(sae_output) >= 2:
                    return sae_output[1]
                elif hasattr(sae, 'encode'):
                    return sae.encode(flat_acts)
                else:
                    if hasattr(sae, 'W_enc') and hasattr(sae, 'b_enc'):
                        print(f"flat_acts: {flat_acts.shape}, sae.W_enc: {sae.W_enc.shape}, sae.b_enc: {sae.b_enc.shape}")
                        return torch.relu(flat_acts @ sae.W_enc + sae.b_enc)
                    else:
                        return torch.randn(flat_acts.shape[0], 16384, device=flat_acts.device)
            
            features1 = extract_features(flat_acts1)
            features2 = extract_features(flat_acts2)
            
            # 1. Cosine similarity
            cosine_sim = torch.nn.functional.cosine_similarity(
                features1.mean(dim=0), features2.mean(dim=0), dim=0
            ).item()
            
            # 2. L2 distance
            l2_distance = torch.norm(features1.mean(dim=0) - features2.mean(dim=0), p=2).item()
            
            # 3. Feature overlap (Jaccard similarity)
            active1 = (features1 > 0).float()
            active2 = (features2 > 0).float()
            intersection = (active1 * active2).sum(dim=0)
            union = torch.clamp(active1.sum(dim=0) + active2.sum(dim=0) - intersection, min=1)
            feature_overlap = (intersection / union).mean().item()
            
            # 4. Jensen-Shannon divergence
            def js_divergence(p, q):
                p = p + 1e-8
                q = q + 1e-8
                p = p / p.sum()
                q = q / q.sum()
                m = 0.5 * (p + q)
                return 0.5 * (torch.nn.functional.kl_div(p.log(), m, reduction='sum') + 
                             torch.nn.functional.kl_div(q.log(), m, reduction='sum'))
            
            p = features1.mean(dim=0).abs()
            q = features2.mean(dim=0).abs()
            js_div = js_divergence(p, q).item()
            
            # 5. Feature correlation
            try:
                corr_matrix = torch.corrcoef(torch.stack([
                    features1.mean(dim=0), features2.mean(dim=0)
                ]))
                feature_correlation = corr_matrix[0, 1].item() if not torch.isnan(corr_matrix[0, 1]) else 0.0
            except:
                feature_correlation = 0.0
            
            return RepresentationShift(
                cosine_similarity=cosine_sim,
                l2_distance=l2_distance,
                feature_overlap=feature_overlap,
                js_divergence=js_div,
                feature_correlation=feature_correlation
            )

    def _compute_layer_aggregate(self, layer_metrics: Dict) -> Dict:
        """Compute aggregate statistics for a single layer."""
        n_samples = len(layer_metrics['model1_metrics'])
        if n_samples == 0:
            return {}
        
        # Average metrics across samples
        avg_model1 = {}
        avg_model2 = {}
        avg_shift = {}
        
        for field in SAEMetrics.__dataclass_fields__:
            avg_model1[field] = np.mean([getattr(m, field) for m in layer_metrics['model1_metrics']])
            avg_model2[field] = np.mean([getattr(m, field) for m in layer_metrics['model2_metrics']])
        
        for field in RepresentationShift.__dataclass_fields__:
            avg_shift[field] = np.mean([getattr(s, field) for s in layer_metrics['shift_metrics']])
        
        return {
            'avg_model1_metrics': avg_model1,
            'avg_model2_metrics': avg_model2,
            'avg_shift_metrics': avg_shift,
            'n_samples': n_samples
        }

    def _compute_overall_analysis(self, results: Dict) -> Dict:
        """Compute overall analysis across all layers."""
        layers = results['layers']
        
        # Collect metrics across layers
        layer_similarities = []
        layer_overlaps = []
        layer_delta_losses = []
        layer_sparsities = []
        layer_rec_loss_topk = []
        
        for layer in layers:
            if layer in results['layer_results'] and 'aggregate' in results['layer_results'][layer]:
                agg = results['layer_results'][layer]['aggregate']
                if agg:  # Check if aggregate is not empty
                    layer_similarities.append(agg['avg_shift_metrics']['cosine_similarity'])
                    layer_overlaps.append(agg['avg_shift_metrics']['feature_overlap'])
                    layer_delta_losses.append(abs(agg['avg_model1_metrics']['model_delta_loss'] - 
                                                 agg['avg_model2_metrics']['model_delta_loss']))
                    layer_sparsities.append((agg['avg_model1_metrics']['l0_sparsity'] + 
                                           agg['avg_model2_metrics']['l0_sparsity']) / 2)

        
        # Overall insights
        overall = {
            'most_similar_layer': layers[np.argmax(layer_similarities)] if layer_similarities else None,
            'most_different_layer': layers[np.argmin(layer_similarities)] if layer_similarities else None,
            'highest_overlap_layer': layers[np.argmax(layer_overlaps)] if layer_overlaps else None,
            'highest_delta_loss_layer': layers[np.argmax(layer_delta_losses)] if layer_delta_losses else None,
            'avg_similarity_across_layers': np.mean(layer_similarities) if layer_similarities else 0,
            'avg_overlap_across_layers': np.mean(layer_overlaps) if layer_overlaps else 0,
            'avg_delta_loss_across_layers': np.mean(layer_delta_losses) if layer_delta_losses else 0,
            'layer_similarities': dict(zip(layers, layer_similarities)),
            'layer_overlaps': dict(zip(layers, layer_overlaps))
        }
        
        return overall

    def visualize_layer_sweep_results(self, results: Dict, model1_name: str, model2_name: str):
        """Create comprehensive visualization of layer sweep results."""
        layers = results['layers']
        
        # Create output filename
        model1_clean = model1_name.replace('/', '_').replace('-', '_')
        model2_clean = model2_name.replace('/', '_').replace('-', '_')
        save_path = self.output_dir / f"{model1_clean}_{model2_clean}_layer_sweep.png"
        
        fig, axes = plt.subplots(2, 3, figsize=(20, 12))
        fig.suptitle(f'SAE Layer Sweep Analysis: {model1_name} vs {model2_name}', fontsize=16)
        
        # Collect data across layers
        layer_data = {
            'similarities': [],
            'overlaps': [],
            'recon_losses_m1': [],
            'recon_losses_m2': [],
            'sparsities_m1': [],
            'sparsities_m2': [],
            'delta_losses_m1': [],
            'delta_losses_m2': []
        }
        
        for layer in layers:
            if layer in results['layer_results'] and 'aggregate' in results['layer_results'][layer]:
                agg = results['layer_results'][layer]['aggregate']
                if agg:
                    layer_data['similarities'].append(agg['avg_shift_metrics']['cosine_similarity'])
                    layer_data['overlaps'].append(agg['avg_shift_metrics']['feature_overlap'])
                    layer_data['recon_losses_m1'].append(agg['avg_model1_metrics']['reconstruction_loss'])
                    layer_data['recon_losses_m2'].append(agg['avg_model2_metrics']['reconstruction_loss'])
                    layer_data['sparsities_m1'].append(agg['avg_model1_metrics']['l0_sparsity'])
                    layer_data['sparsities_m2'].append(agg['avg_model2_metrics']['l0_sparsity'])
                    layer_data['delta_losses_m1'].append(agg['avg_model1_metrics']['model_delta_loss'])
                    layer_data['delta_losses_m2'].append(agg['avg_model2_metrics']['model_delta_loss'])
        
        # Plot 1: Representation Similarity Across Layers
        axes[0, 0].plot(layers, layer_data['similarities'], 'o-', linewidth=2, markersize=8)
        axes[0, 0].set_title('Cosine Similarity Across Layers')
        axes[0, 0].set_xlabel('Layer')
        axes[0, 0].set_ylabel('Cosine Similarity')
        axes[0, 0].grid(True, alpha=0.3)
        axes[0, 0].axhline(y=0.8, color='red', linestyle='--', alpha=0.5, label='High Similarity')
        axes[0, 0].legend()
        
        # Plot 2: Feature Overlap Across Layers
        axes[0, 1].plot(layers, layer_data['overlaps'], 'o-', color='green', linewidth=2, markersize=8)
        axes[0, 1].set_title('Feature Overlap Across Layers')
        axes[0, 1].set_xlabel('Layer')
        axes[0, 1].set_ylabel('Feature Overlap')
        axes[0, 1].grid(True, alpha=0.3)
        axes[0, 1].axhline(y=0.5, color='red', linestyle='--', alpha=0.5, label='Moderate Overlap')
        axes[0, 1].legend()
        
        # Plot 3: Reconstruction Loss Comparison
        axes[0, 2].plot(layers, layer_data['recon_losses_m1'], 'o-', label='Model 1 (LLM)', linewidth=2)
        axes[0, 2].plot(layers, layer_data['recon_losses_m2'], 's-', label='Model 2 (VLM)', linewidth=2)
        axes[0, 2].set_title('Reconstruction Loss Across Layers')
        axes[0, 2].set_xlabel('Layer')
        axes[0, 2].set_ylabel('Reconstruction Loss')
        axes[0, 2].legend()
        axes[0, 2].grid(True, alpha=0.3)
        
        # Plot 4: Sparsity Comparison
        axes[1, 0].plot(layers, layer_data['sparsities_m1'], 'o-', label='Model 1 (LLM)', linewidth=2)
        axes[1, 0].plot(layers, layer_data['sparsities_m2'], 's-', label='Model 2 (VLM)', linewidth=2)
        axes[1, 0].set_title('L0 Sparsity Across Layers')
        axes[1, 0].set_xlabel('Layer')
        axes[1, 0].set_ylabel('L0 Sparsity')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)
        
        # Plot 5: Model Delta Loss (Patching Performance)
        axes[1, 1].plot(layers, layer_data['delta_losses_m1'], 'o-', label='Model 1 (LLM)', linewidth=2)
        axes[1, 1].plot(layers, layer_data['delta_losses_m2'], 's-', label='Model 2 (VLM)', linewidth=2)
        axes[1, 1].set_title('Model Delta Loss (Patching Quality)')
        axes[1, 1].set_xlabel('Layer')
        axes[1, 1].set_ylabel('Delta Loss')
        axes[1, 1].legend()
        axes[1, 1].grid(True, alpha=0.3)
        axes[1, 1].axhline(y=0, color='black', linestyle='-', alpha=0.3)
        
        # Plot 6: Summary Heatmap
        # Create a summary matrix for visualization
        metrics_matrix = np.array([
            layer_data['similarities'],
            layer_data['overlaps'],
            np.array(layer_data['recon_losses_m1']) / max(max(layer_data['recon_losses_m1']), 1e-6),  # Normalize
            np.array(layer_data['sparsities_m1']) * 10,  # Scale up for visibility
        ])
        
        im = axes[1, 2].imshow(metrics_matrix, cmap='RdYlBu_r', aspect='auto')
        axes[1, 2].set_title('Metrics Heatmap Across Layers')
        axes[1, 2].set_xlabel('Layer Index')
        axes[1, 2].set_yticks(range(4))
        axes[1, 2].set_yticklabels(['Similarity', 'Overlap', 'Recon Loss (norm)', 'Sparsity (x10)'])
        axes[1, 2].set_xticks(range(len(layers)))
        axes[1, 2].set_xticklabels([f'L{l}' for l in layers])
        
        # Add colorbar
        cbar = plt.colorbar(im, ax=axes[1, 2])
        cbar.set_label('Metric Value')
        
        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✅ Layer sweep visualization saved to {save_path}")
        
        # Save detailed results as JSON including top features
        json_path = self.output_dir / f"{model1_clean}_{model2_clean}_results.json"
        
        # Convert results to JSON-serializable format
        json_results = {
            'layers': layers,
            'overall_analysis': results['overall_analysis'],
            'layer_summaries': {},
            'top_features_analysis': getattr(self, 'top_features_log', [])
        }
        
        for layer in layers:
            if layer in results['layer_results'] and 'aggregate' in results['layer_results'][layer]:
                agg = results['layer_results'][layer]['aggregate']
                if agg:
                    json_results['layer_summaries'][str(layer)] = agg
        
        with open(json_path, 'w') as f:
            json.dump(json_results, f, indent=2)
        print(f"✅ Detailed results saved to {json_path}")
        
        # Create top features analysis
        self._analyze_top_features_trends()

    def interpret_layer_sweep_results(self, results: Dict) -> Dict[str, str]:
        """Provide interpretation of layer sweep results."""
        overall = results['overall_analysis']
        interpretations = {}
        
        # Overall adaptation assessment
        avg_similarity = overall['avg_similarity_across_layers']
        if avg_similarity > 0.85:
            interpretations['adaptation_magnitude'] = "✅ MINIMAL LLM→VLM adaptation - representations largely preserved"
        elif avg_similarity > 0.7:
            interpretations['adaptation_magnitude'] = "⚠️ MODERATE LLM→VLM adaptation - selective representational changes"
        else:
            interpretations['adaptation_magnitude'] = "🔍 SIGNIFICANT LLM→VLM adaptation - substantial representational reorganization"
        
        # Layer-specific insights
        if overall['most_different_layer'] is not None:
            interpretations['adaptation_location'] = f"🎯 Layer {overall['most_different_layer']} shows maximum adaptation"
        
        if overall['highest_overlap_layer'] is not None:
            interpretations['feature_preservation'] = f"🔗 Layer {overall['highest_overlap_layer']} best preserves LLM features"
        
        # Adaptation pattern
        layer_sims = list(overall['layer_similarities'].values())
        if len(layer_sims) >= 3:
            early_sim = np.mean(layer_sims[:len(layer_sims)//3])
            late_sim = np.mean(layer_sims[-len(layer_sims)//3:])
            
            if early_sim > late_sim + 0.1:
                interpretations['adaptation_pattern'] = "📈 Early layers preserve LLM representations better than late layers"
            elif late_sim > early_sim + 0.1:
                interpretations['adaptation_pattern'] = "📉 Late layers preserve LLM representations better than early layers"
            else:
                interpretations['adaptation_pattern'] = "📊 Uniform adaptation pattern across layers"
        
        # SAE quality assessment
        avg_delta_loss = overall['avg_delta_loss_across_layers']
        if avg_delta_loss < 0.1:
            interpretations['sae_quality'] = "✅ SAE reconstructions preserve model functionality well"
        elif avg_delta_loss < 0.5:
            interpretations['sae_quality'] = "⚠️ SAE reconstructions cause moderate functional degradation"
        else:
            interpretations['sae_quality'] = "❌ SAE reconstructions significantly impact model functionality"
        
        return interpretations


print("🚀 Comprehensive SAE Layer Sweep Analysis: LLM→VLM Adaptation")
print("=" * 70)

# Configuration
MODEL_SIZE = "2b"
WIDTH = "16k"
SUFFIX = "canonical"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#     LAYERS = [4, 8, 12, 16, 20, 24]  # Sample across the model depth
LAYERS = list(range(25))
try:
    # Initialize analyzer
    analyzer = MemoryEfficientSAEAnalyzer(
        model_size=MODEL_SIZE,
        width=WIDTH,
        suffix=SUFFIX,
        device=DEVICE
    )

    # Load dataset
    print("\n📚 Loading Datasets...")
    dataset_loader = DatasetLoader(device=DEVICE)
    texts = dataset_loader.get_mixed_dataset(total_samples=1000)  # Use 1K data as requested

    print(f"✅ Loaded {len(texts)} texts from mixed datasets")
    print(f"Sample texts: {texts[:3]}")

    # Model configuration for LLM->VLM comparison
    model1_name = "google/gemma-2-2b"  # Base Gemma-2-2B (LLM)
    model2_name = "google/paligemma2-3b-pt-224"  # PaliGemma with Gemma-2-2B decoder (VLM)

    print(f"\n🔬 Research Configuration:")
    print(f"   Model 1 (LLM): {model1_name}")
    print(f"   Model 2 (VLM): {model2_name}")
    print(f"   Layers to analyze: {LAYERS}")
    print(f"   SAE Configuration: {MODEL_SIZE}-{WIDTH}-{SUFFIX}")
    print(f"   Device: {DEVICE}")
    print(f"   Total texts: {len(texts)}")

    # Run layer sweep analysis
    print(f"\n🚀 Starting Layer Sweep Analysis...")
    results = analyzer.analyze_layer_sweep(
        model1_name=model1_name,
        model2_name=model2_name,
        texts=texts,
        layers=LAYERS
    )

    # Generate interpretations
    interpretations = analyzer.interpret_layer_sweep_results(results)

    print(f"\n📊 LAYER SWEEP RESULTS:")
    print("=" * 50)

    overall = results['overall_analysis']
    print(f"Most Similar Layer: {overall['most_similar_layer']}")
    print(f"Most Different Layer: {overall['most_different_layer']}")
    print(f"Highest Feature Overlap Layer: {overall['highest_overlap_layer']}")
    print(f"Average Similarity Across Layers: {overall['avg_similarity_across_layers']:.3f}")
    print(f"Average Feature Overlap: {overall['avg_overlap_across_layers']:.3f}")

    print(f"\n🔍 INTERPRETATIONS:")
    print("=" * 50)
    for aspect, interpretation in interpretations.items():
        print(f"{aspect.replace('_', ' ').title()}: {interpretation}")

    # Create visualizations
    print(f"\n📈 Generating Visualizations...")
    analyzer.visualize_layer_sweep_results(results, model1_name, model2_name)

    print(f"\n✅ Analysis Complete!")
    print(f"📁 Results saved to: {analyzer.output_dir}")
    print(f"🧠 Key Finding: {interpretations.get('adaptation_magnitude', 'Analysis completed')}")

    # Memory cleanup
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print(f"🔧 Final GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f}GB")

except Exception as e:
    print(f"❌ Error during analysis: {e}")
    import traceback
    traceback.print_exc()

    print("\n💡 Troubleshooting Tips:")
    print("   1. Ensure sufficient GPU memory (8GB+ recommended)")
    print("   2. Reduce LAYERS list or sample size if out of memory")
    print("   3. Check model names are correct and accessible")
    print("   4. Install required packages: pip install sae-lens transformers datasets")


  from .autonotebook import tqdm as notebook_tqdm


🚀 Comprehensive SAE Layer Sweep Analysis: LLM→VLM Adaptation
🔧 Initialized SAE Analyzer
   Device: cuda
   Model Size: 2b
   SAE Width: 16k
   Output Dir: ../figs_tabs

📚 Loading Datasets...
✅ Loaded 666 CIFAR-100 captions
❌ Error loading COCO: 'utf-8' codec can't decode byte 0xc4 in position 4: invalid continuation byte
❌ Error loading LLaVA-Bench: Config name is missing.
Please pick one among the available configs: ['CLEVR-Math(MathV360K)', 'Evol-Instruct-GPT4-Turbo', 'FigureQA(MathV360K)', 'GEOS(MathV360K)', 'GeoQA+(MathV360K)', 'Geometry3K(MathV360K)', 'IconQA(MathV360K)', 'MapQA(MathV360K)', 'MathV360K_TQA', 'MathV360K_VQA-AS', 'MathV360K_VQA-RAD', 'PMC-VQA(MathV360K)', 'Super-CLEVR(MathV360K)', 'TabMWP(MathV360K)', 'UniGeo(MathV360K)', 'VisualWebInstruct(filtered)', 'VizWiz(MathV360K)', 'ai2d(cauldron,llava_format)', 'ai2d(gpt4v)', 'ai2d(internvl)', 'allava_instruct_laion4v', 'allava_instruct_vflan4v', 'aokvqa(cauldron,llava_format)', 'chart2text(cauldron)', 'chartqa(cauldron,lla

Processing layers:   0%|          | 0/25 [00:00<?, ?it/s]


📊 Processing Layer 0
   📥 Loading SAE Layer 0: layer_0/width_16k/canonical



Layer 0 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.15s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.46it/s][A[A

Layer 0 texts:   1%|          | 1/100 [00:14<23:13, 14.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.30s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.32it/s][A[A

Layer 0 texts:   2%|▏         | 2/100 [00:27<22:21, 13.69s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.08it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.03s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.63it/s][A[A

Layer 0 texts:   3%|▎         | 3/100 [00:39<21:11, 13.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.12it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s][A[A

Layer 0 texts:   4%|▍         | 4/100 [00:52<20:27, 12.79s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.88it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.01it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.71it/s][A[A

Layer 0 texts:   5%|▌         | 5/100 [01:04<19:57, 12.60s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.17it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.03s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.66it/s][A[A

Layer 0 texts:   6%|▌         | 6/100 [01:17<19:49, 12.65s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.86it/s][A[A

Layer 0 texts:   7%|▋         | 7/100 [01:29<19:17, 12.45s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.02it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 0 texts:   8%|▊         | 8/100 [01:41<18:59, 12.38s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.04it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.75it/s][A[A

Layer 0 texts:   9%|▉         | 9/100 [01:54<18:53, 12.45s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.01it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.71it/s][A[A

Layer 0 texts:  10%|█         | 10/100 [02:06<18:37, 12.41s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.11it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s][A[A

Layer 0 texts:  11%|█         | 11/100 [02:18<18:13, 12.29s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.39it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 0 texts:  12%|█▏        | 12/100 [02:30<17:56, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 0 texts:  13%|█▎        | 13/100 [02:42<17:38, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.08it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.80it/s][A[A

Layer 0 texts:  14%|█▍        | 14/100 [02:54<17:25, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.12it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s][A[A

Layer 0 texts:  15%|█▌        | 15/100 [03:06<17:06, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.29it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.88it/s][A[A

Layer 0 texts:  16%|█▌        | 16/100 [03:18<16:52, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.48it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.04it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.76it/s][A[A

Layer 0 texts:  17%|█▋        | 17/100 [03:30<16:44, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.10it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.83it/s][A[A

Layer 0 texts:  18%|█▊        | 18/100 [03:42<16:29, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.46it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 0 texts:  19%|█▉        | 19/100 [03:54<16:14, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.48it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 0 texts:  20%|██        | 20/100 [04:06<16:04, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.31it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 0 texts:  21%|██        | 21/100 [04:18<15:52, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.96it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 0 texts:  22%|██▏       | 22/100 [04:30<15:32, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.96it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.10it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s][A[A

Layer 0 texts:  23%|██▎       | 23/100 [04:42<15:21, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.24it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 0 texts:  24%|██▍       | 24/100 [04:54<15:13, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.05it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 0 texts:  25%|██▌       | 25/100 [05:06<14:59, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 0 texts:  26%|██▌       | 26/100 [05:18<14:47, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 0 texts:  27%|██▋       | 27/100 [05:30<14:31, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.99it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.12it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.87it/s][A[A

Layer 0 texts:  28%|██▊       | 28/100 [05:42<14:21, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.86it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.09it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.84it/s][A[A

Layer 0 texts:  29%|██▉       | 29/100 [05:54<14:12, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.08it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.81it/s][A[A

Layer 0 texts:  30%|███       | 30/100 [06:07<14:08, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.98it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 0 texts:  31%|███       | 31/100 [06:19<13:55, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 0 texts:  32%|███▏      | 32/100 [06:31<13:42, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.85it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 0 texts:  33%|███▎      | 33/100 [06:43<13:30, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.53it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 0 texts:  34%|███▍      | 34/100 [06:55<13:16, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.55it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 0 texts:  35%|███▌      | 35/100 [07:07<13:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 0 texts:  36%|███▌      | 36/100 [07:19<12:45, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.47it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 0 texts:  37%|███▋      | 37/100 [07:30<12:32, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 0 texts:  38%|███▊      | 38/100 [07:43<12:30, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.45it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 0 texts:  39%|███▉      | 39/100 [07:55<12:12, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.37it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 0 texts:  40%|████      | 40/100 [08:06<11:56, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 0 texts:  41%|████      | 41/100 [08:18<11:44, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.43it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 0 texts:  42%|████▏     | 42/100 [08:30<11:29, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.62it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 0 texts:  43%|████▎     | 43/100 [08:42<11:20, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.38it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 0 texts:  44%|████▍     | 44/100 [08:54<11:08, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 0 texts:  45%|████▌     | 45/100 [09:06<10:56, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 0 texts:  46%|████▌     | 46/100 [09:18<10:44, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 0 texts:  47%|████▋     | 47/100 [09:30<10:36, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 0 texts:  48%|████▊     | 48/100 [09:42<10:23, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 0 texts:  49%|████▉     | 49/100 [09:54<10:13, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.47it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 0 texts:  50%|█████     | 50/100 [10:06<09:59, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 0 texts:  51%|█████     | 51/100 [10:18<09:49, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 0 texts:  52%|█████▏    | 52/100 [10:30<09:35, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 0 texts:  53%|█████▎    | 53/100 [10:42<09:22, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.75it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 0 texts:  54%|█████▍    | 54/100 [10:54<09:10, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 0 texts:  55%|█████▌    | 55/100 [11:06<09:00, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.50it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 0 texts:  56%|█████▌    | 56/100 [11:18<08:46, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.31it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 0 texts:  57%|█████▋    | 57/100 [11:30<08:31, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.84it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 0 texts:  58%|█████▊    | 58/100 [11:42<08:20, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.55it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 0 texts:  59%|█████▉    | 59/100 [11:54<08:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.94it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 0 texts:  60%|██████    | 60/100 [12:06<07:58, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 0 texts:  61%|██████    | 61/100 [12:18<07:44, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.49it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 0 texts:  62%|██████▏   | 62/100 [12:30<07:32, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 0 texts:  63%|██████▎   | 63/100 [12:41<07:19, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 0 texts:  64%|██████▍   | 64/100 [12:53<07:08, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 0 texts:  65%|██████▌   | 65/100 [13:05<06:54, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 0 texts:  66%|██████▌   | 66/100 [13:17<06:42, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 0 texts:  67%|██████▋   | 67/100 [13:29<06:30, 11.82s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.02it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 0 texts:  68%|██████▊   | 68/100 [13:40<06:17, 11.81s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.24it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 0 texts:  69%|██████▉   | 69/100 [13:52<06:05, 11.79s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.08it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 0 texts:  70%|███████   | 70/100 [14:04<05:52, 11.76s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 0 texts:  71%|███████   | 71/100 [14:16<05:42, 11.82s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.15it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 0 texts:  72%|███████▏  | 72/100 [14:28<05:30, 11.80s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.45it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 0 texts:  73%|███████▎  | 73/100 [14:39<05:17, 11.76s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.02it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 0 texts:  74%|███████▍  | 74/100 [14:51<05:07, 11.81s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 0 texts:  75%|███████▌  | 75/100 [15:03<04:57, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 0 texts:  76%|███████▌  | 76/100 [15:15<04:44, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.15it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 0 texts:  77%|███████▋  | 77/100 [15:27<04:32, 11.84s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.87it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 0 texts:  78%|███████▊  | 78/100 [15:39<04:20, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.20it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 0 texts:  79%|███████▉  | 79/100 [15:50<04:07, 11.81s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.13it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 0 texts:  80%|████████  | 80/100 [16:02<03:57, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 0 texts:  81%|████████  | 81/100 [16:14<03:45, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 0 texts:  82%|████████▏ | 82/100 [16:26<03:32, 11.82s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 0 texts:  83%|████████▎ | 83/100 [16:38<03:21, 11.83s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.84it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 0 texts:  84%|████████▍ | 84/100 [16:50<03:10, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 0 texts:  85%|████████▌ | 85/100 [17:02<02:57, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 0 texts:  86%|████████▌ | 86/100 [17:13<02:45, 11.82s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 0 texts:  87%|████████▋ | 87/100 [17:25<02:34, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 0 texts:  88%|████████▊ | 88/100 [17:37<02:22, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.62it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 0 texts:  89%|████████▉ | 89/100 [17:49<02:11, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.25it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 0 texts:  90%|█████████ | 90/100 [18:01<01:59, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 0 texts:  91%|█████████ | 91/100 [18:13<01:47, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.08it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 0 texts:  92%|█████████▏| 92/100 [18:26<01:38, 12.27s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.14it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 0 texts:  93%|█████████▎| 93/100 [18:38<01:25, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.12it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.88it/s][A[A

Layer 0 texts:  94%|█████████▍| 94/100 [18:50<01:12, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.33it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.11it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.84it/s][A[A

Layer 0 texts:  95%|█████████▌| 95/100 [19:02<01:00, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 0 texts:  96%|█████████▌| 96/100 [19:14<00:48, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 0 texts:  97%|█████████▋| 97/100 [19:26<00:36, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 0 texts:  98%|█████████▊| 98/100 [19:38<00:23, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 0 texts:  99%|█████████▉| 99/100 [19:50<00:11, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 0 texts: 100%|██████████| 100/100 [20:02<00:00, 12.07s/it][A
Processing layers:   4%|▍         | 1/25 [20:07<8:02:55, 1207.31s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
   Memory after layer 0: 12.53GB

📊 Processing Layer 1
   📥 Loading SAE Layer 1: layer_1/width_16k/canonical



Layer 1 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 1 texts:   1%|          | 1/100 [00:11<19:29, 11.82s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 1 texts:   2%|▏         | 2/100 [00:24<19:42, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 1 texts:   3%|▎         | 3/100 [00:35<19:22, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 1 texts:   4%|▍         | 4/100 [00:47<19:05, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 1 texts:   5%|▌         | 5/100 [00:59<18:49, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 1 texts:   6%|▌         | 6/100 [01:11<18:30, 11.81s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 1 texts:   7%|▋         | 7/100 [01:23<18:23, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 1 texts:   8%|▊         | 8/100 [01:35<18:23, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 1 texts:   9%|▉         | 9/100 [01:47<18:12, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.29it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 1 texts:  10%|█         | 10/100 [02:00<18:14, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 1 texts:  11%|█         | 11/100 [02:12<18:03, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 1 texts:  12%|█▏        | 12/100 [02:24<17:42, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.03it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 1 texts:  13%|█▎        | 13/100 [02:35<17:24, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 1 texts:  14%|█▍        | 14/100 [02:47<17:07, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.92it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.09it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.82it/s][A[A

Layer 1 texts:  15%|█▌        | 15/100 [02:59<16:57, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 1 texts:  16%|█▌        | 16/100 [03:11<16:45, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 1 texts:  17%|█▋        | 17/100 [03:23<16:29, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 1 texts:  18%|█▊        | 18/100 [03:35<16:18, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.94it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 1 texts:  19%|█▉        | 19/100 [03:47<16:07, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.94it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 1 texts:  20%|██        | 20/100 [03:59<15:50, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.73it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 1 texts:  21%|██        | 21/100 [04:11<15:40, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.34it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 1 texts:  22%|██▏       | 22/100 [04:23<15:30, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.75it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 1 texts:  23%|██▎       | 23/100 [04:34<15:15, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.08it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 1 texts:  24%|██▍       | 24/100 [04:46<15:04, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.47it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 1 texts:  25%|██▌       | 25/100 [04:58<14:53, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.86it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 1 texts:  26%|██▌       | 26/100 [05:10<14:40, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 1 texts:  27%|██▋       | 27/100 [05:22<14:29, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 1 texts:  28%|██▊       | 28/100 [05:34<14:16, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.62it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 1 texts:  29%|██▉       | 29/100 [05:46<14:04, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.18it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 1 texts:  30%|███       | 30/100 [05:58<13:50, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 1 texts:  31%|███       | 31/100 [06:10<13:39, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.46it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 1 texts:  32%|███▏      | 32/100 [06:22<13:29, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.73it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 1 texts:  33%|███▎      | 33/100 [06:33<13:16, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 1 texts:  34%|███▍      | 34/100 [06:45<13:07, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.12it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.87it/s][A[A

Layer 1 texts:  35%|███▌      | 35/100 [06:57<12:52, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 1 texts:  36%|███▌      | 36/100 [07:09<12:42, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.64it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 1 texts:  37%|███▋      | 37/100 [07:21<12:28, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 1 texts:  38%|███▊      | 38/100 [07:33<12:15, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.94it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 1 texts:  39%|███▉      | 39/100 [07:45<12:05, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.50it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 1 texts:  40%|████      | 40/100 [07:57<11:52, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 1 texts:  41%|████      | 41/100 [08:08<11:39, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.50it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 1 texts:  42%|████▏     | 42/100 [08:20<11:30, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 1 texts:  43%|████▎     | 43/100 [08:32<11:20, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.90it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 1 texts:  44%|████▍     | 44/100 [08:44<11:08, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.71it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 1 texts:  45%|████▌     | 45/100 [08:56<10:57, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.03it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 1 texts:  46%|████▌     | 46/100 [09:08<10:42, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 1 texts:  47%|████▋     | 47/100 [09:20<10:33, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.59it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 1 texts:  48%|████▊     | 48/100 [09:32<10:21, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 1 texts:  49%|████▉     | 49/100 [09:44<10:07, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 1 texts:  50%|█████     | 50/100 [09:56<09:55, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.92it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 1 texts:  51%|█████     | 51/100 [10:08<09:45, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.04it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 1 texts:  52%|█████▏    | 52/100 [10:20<09:34, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.69it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 1 texts:  53%|█████▎    | 53/100 [10:32<09:24, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.05it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 1 texts:  54%|█████▍    | 54/100 [10:44<09:13, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 1 texts:  55%|█████▌    | 55/100 [10:56<08:59, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.93it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 1 texts:  56%|█████▌    | 56/100 [11:08<08:47, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.46it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 1 texts:  57%|█████▋    | 57/100 [11:20<08:37, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 1 texts:  58%|█████▊    | 58/100 [11:32<08:25, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.98it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 1 texts:  59%|█████▉    | 59/100 [11:44<08:15, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.58it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 1 texts:  60%|██████    | 60/100 [11:56<08:03, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 1 texts:  61%|██████    | 61/100 [12:08<07:49, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 1 texts:  62%|██████▏   | 62/100 [12:20<07:35, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.41it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 1 texts:  63%|██████▎   | 63/100 [12:32<07:23, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.56it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 1 texts:  64%|██████▍   | 64/100 [12:44<07:11, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 1 texts:  65%|██████▌   | 65/100 [12:56<06:59, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 1 texts:  66%|██████▌   | 66/100 [13:08<06:46, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 1 texts:  67%|██████▋   | 67/100 [13:20<06:32, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 54.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 1 texts:  68%|██████▊   | 68/100 [13:32<06:24, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 1 texts:  69%|██████▉   | 69/100 [13:44<06:11, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.63it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 1 texts:  70%|███████   | 70/100 [13:56<05:56, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.29it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 1 texts:  71%|███████   | 71/100 [14:07<05:42, 11.82s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 1 texts:  72%|███████▏  | 72/100 [14:19<05:31, 11.84s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.86it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 1 texts:  73%|███████▎  | 73/100 [14:31<05:18, 11.80s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 1 texts:  74%|███████▍  | 74/100 [14:43<05:05, 11.76s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 1 texts:  75%|███████▌  | 75/100 [14:55<04:55, 11.83s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.04it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 1 texts:  76%|███████▌  | 76/100 [15:07<04:44, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 1 texts:  77%|███████▋  | 77/100 [15:19<04:36, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 1 texts:  78%|███████▊  | 78/100 [15:31<04:23, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 1 texts:  79%|███████▉  | 79/100 [15:42<04:09, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.23it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 1 texts:  80%|████████  | 80/100 [15:54<03:57, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 1 texts:  81%|████████  | 81/100 [16:07<03:47, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.58it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 1 texts:  82%|████████▏ | 82/100 [16:18<03:34, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 1 texts:  83%|████████▎ | 83/100 [16:31<03:24, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.93it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 1 texts:  84%|████████▍ | 84/100 [16:42<03:11, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.08it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 1 texts:  85%|████████▌ | 85/100 [16:54<02:59, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.15it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 1 texts:  86%|████████▌ | 86/100 [17:06<02:46, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.58it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 1 texts:  87%|████████▋ | 87/100 [17:18<02:34, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.08it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 1 texts:  88%|████████▊ | 88/100 [17:29<02:21, 11.78s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 1 texts:  89%|████████▉ | 89/100 [17:41<02:09, 11.80s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 1 texts:  90%|█████████ | 90/100 [17:53<01:58, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 1 texts:  91%|█████████ | 91/100 [18:05<01:46, 11.83s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 1 texts:  92%|█████████▏| 92/100 [18:17<01:34, 11.84s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 1 texts:  93%|█████████▎| 93/100 [18:29<01:22, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.18it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 1 texts:  94%|█████████▍| 94/100 [18:41<01:10, 11.82s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 1 texts:  95%|█████████▌| 95/100 [18:52<00:59, 11.80s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.99it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 1 texts:  96%|█████████▌| 96/100 [19:04<00:47, 11.80s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 1 texts:  97%|█████████▋| 97/100 [19:17<00:36, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 1 texts:  98%|█████████▊| 98/100 [19:29<00:24, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.96it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 1 texts:  99%|█████████▉| 99/100 [19:40<00:11, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 1 texts: 100%|██████████| 100/100 [19:52<00:00, 11.88s/it][A
Processing layers:   8%|▊         | 2/25 [40:01<7:39:47, 1199.45s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
   Memory after layer 1: 12.83GB

📊 Processing Layer 2
   📥 Loading SAE Layer 2: layer_2/width_16k/canonical



Layer 2 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.28it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 2 texts:   1%|          | 1/100 [00:11<19:40, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.83it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 2 texts:   2%|▏         | 2/100 [00:23<19:21, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 2 texts:   3%|▎         | 3/100 [00:35<19:02, 11.78s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.01it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 2 texts:   4%|▍         | 4/100 [00:47<18:57, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 2 texts:   5%|▌         | 5/100 [00:59<18:46, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.57it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 2 texts:   6%|▌         | 6/100 [01:10<18:30, 11.81s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.09it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 2 texts:   7%|▋         | 7/100 [01:23<18:28, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 2 texts:   8%|▊         | 8/100 [01:34<18:15, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.37it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 2 texts:   9%|▉         | 9/100 [01:47<18:08, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.18it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  10%|█         | 10/100 [01:58<17:52, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.29it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  11%|█         | 11/100 [02:10<17:38, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.34it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 2 texts:  12%|█▏        | 12/100 [02:22<17:26, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  13%|█▎        | 13/100 [02:34<17:11, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.99it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  14%|█▍        | 14/100 [02:46<16:58, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.82it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 2 texts:  15%|█▌        | 15/100 [02:58<16:51, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.32it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 2 texts:  16%|█▌        | 16/100 [03:10<16:41, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.49it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 2 texts:  17%|█▋        | 17/100 [03:22<16:29, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.85it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  18%|█▊        | 18/100 [03:34<16:23, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.13it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  19%|█▉        | 19/100 [03:46<16:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  20%|██        | 20/100 [03:58<15:59, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 2 texts:  21%|██        | 21/100 [04:10<15:55, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 2 texts:  22%|██▏       | 22/100 [04:22<15:36, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.40it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 2 texts:  23%|██▎       | 23/100 [04:34<15:25, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 2 texts:  24%|██▍       | 24/100 [04:46<15:11, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 2 texts:  25%|██▌       | 25/100 [04:58<15:01, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 2 texts:  26%|██▌       | 26/100 [05:10<14:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.48it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 2 texts:  27%|██▋       | 27/100 [05:22<14:40, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.83it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 2 texts:  28%|██▊       | 28/100 [05:34<14:29, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.33it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 2 texts:  29%|██▉       | 29/100 [05:46<14:19, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.38it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 2 texts:  30%|███       | 30/100 [05:59<14:08, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 2 texts:  31%|███       | 31/100 [06:11<14:01, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.49it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 2 texts:  32%|███▏      | 32/100 [06:23<13:48, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 2 texts:  33%|███▎      | 33/100 [06:35<13:35, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.47it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 2 texts:  34%|███▍      | 34/100 [06:48<13:29, 12.26s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.18it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 2 texts:  35%|███▌      | 35/100 [07:00<13:12, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.75it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 2 texts:  36%|███▌      | 36/100 [07:12<12:56, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.98it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 2 texts:  37%|███▋      | 37/100 [07:24<12:45, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 2 texts:  38%|███▊      | 38/100 [07:36<12:33, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.91it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 2 texts:  39%|███▉      | 39/100 [07:48<12:19, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 2 texts:  40%|████      | 40/100 [08:00<12:03, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 2 texts:  41%|████      | 41/100 [08:12<11:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 2 texts:  42%|████▏     | 42/100 [08:24<11:37, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 2 texts:  43%|████▎     | 43/100 [08:36<11:26, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 2 texts:  44%|████▍     | 44/100 [08:48<11:14, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.41it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 2 texts:  45%|████▌     | 45/100 [09:00<11:02, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  46%|████▌     | 46/100 [09:12<10:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  47%|████▋     | 47/100 [09:24<10:36, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  48%|████▊     | 48/100 [09:36<10:25, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  49%|████▉     | 49/100 [09:48<10:12, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 2 texts:  50%|█████     | 50/100 [10:00<10:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  51%|█████     | 51/100 [10:12<09:47, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 2 texts:  52%|█████▏    | 52/100 [10:24<09:35, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 2 texts:  53%|█████▎    | 53/100 [10:36<09:23, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 2 texts:  54%|█████▍    | 54/100 [10:48<09:13, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.15it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  55%|█████▌    | 55/100 [11:00<09:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 2 texts:  56%|█████▌    | 56/100 [11:12<08:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 53.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  57%|█████▋    | 57/100 [11:24<08:32, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.20it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  58%|█████▊    | 58/100 [11:36<08:17, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  59%|█████▉    | 59/100 [11:48<08:07, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 2 texts:  60%|██████    | 60/100 [12:00<07:57, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.40it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 2 texts:  61%|██████    | 61/100 [12:12<07:45, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.29it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 2 texts:  62%|██████▏   | 62/100 [12:23<07:33, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.55it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  63%|██████▎   | 63/100 [12:35<07:21, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.71it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 2 texts:  64%|██████▍   | 64/100 [12:47<07:10, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.91it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 2 texts:  65%|██████▌   | 65/100 [12:59<06:58, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.13it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 2 texts:  66%|██████▌   | 66/100 [13:11<06:44, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 2 texts:  67%|██████▋   | 67/100 [13:23<06:33, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.58it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 2 texts:  68%|██████▊   | 68/100 [13:35<06:23, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 2 texts:  69%|██████▉   | 69/100 [13:47<06:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.29it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 2 texts:  70%|███████   | 70/100 [13:59<06:01, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 2 texts:  71%|███████   | 71/100 [14:11<05:48, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  72%|███████▏  | 72/100 [14:23<05:36, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.09it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 2 texts:  73%|███████▎  | 73/100 [14:35<05:24, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.34it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 2 texts:  74%|███████▍  | 74/100 [14:48<05:14, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.45it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 2 texts:  75%|███████▌  | 75/100 [15:00<05:01, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.09it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 2 texts:  76%|███████▌  | 76/100 [15:12<04:49, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 2 texts:  77%|███████▋  | 77/100 [15:24<04:38, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.49it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.12it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.87it/s][A[A

Layer 2 texts:  78%|███████▊  | 78/100 [15:36<04:26, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 2 texts:  79%|███████▉  | 79/100 [15:48<04:13, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.93it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 2 texts:  80%|████████  | 80/100 [16:01<04:03, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 2 texts:  81%|████████  | 81/100 [16:13<03:50, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.38it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 2 texts:  82%|████████▏ | 82/100 [16:25<03:38, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 2 texts:  83%|████████▎ | 83/100 [16:37<03:25, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 2 texts:  84%|████████▍ | 84/100 [16:49<03:14, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.44it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 2 texts:  85%|████████▌ | 85/100 [17:01<03:01, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 2 texts:  86%|████████▌ | 86/100 [17:13<02:49, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.87it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 2 texts:  87%|████████▋ | 87/100 [17:25<02:36, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.25it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 2 texts:  88%|████████▊ | 88/100 [17:37<02:24, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  89%|████████▉ | 89/100 [17:49<02:12, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.14it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 2 texts:  90%|█████████ | 90/100 [18:01<01:59, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.17it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 2 texts:  91%|█████████ | 91/100 [18:13<01:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.02it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 2 texts:  92%|█████████▏| 92/100 [18:25<01:36, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 2 texts:  93%|█████████▎| 93/100 [18:37<01:24, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 2 texts:  94%|█████████▍| 94/100 [18:49<01:12, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 2 texts:  95%|█████████▌| 95/100 [19:01<01:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.90it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 2 texts:  96%|█████████▌| 96/100 [19:13<00:47, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.28it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 2 texts:  97%|█████████▋| 97/100 [19:25<00:36, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 2 texts:  98%|█████████▊| 98/100 [19:37<00:24, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 2 texts:  99%|█████████▉| 99/100 [19:49<00:12, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.44it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 2 texts: 100%|██████████| 100/100 [20:01<00:00, 12.05s/it][A
Processing layers:  12%|█▏        | 3/25 [1:00:04<7:20:24, 1201.12s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
   Memory after layer 2: 12.83GB

📊 Processing Layer 3
   📥 Loading SAE Layer 3: layer_3/width_16k/canonical



Layer 3 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 3 texts:   1%|          | 1/100 [00:11<19:44, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.39it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 3 texts:   2%|▏         | 2/100 [00:24<19:42, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 3 texts:   3%|▎         | 3/100 [00:36<19:32, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 3 texts:   4%|▍         | 4/100 [00:48<19:23, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.28it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 3 texts:   5%|▌         | 5/100 [01:00<19:07, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.77it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 3 texts:   6%|▌         | 6/100 [01:12<19:00, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 3 texts:   7%|▋         | 7/100 [01:24<18:43, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.29it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 3 texts:   8%|▊         | 8/100 [01:36<18:28, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.27it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 3 texts:   9%|▉         | 9/100 [01:48<18:11, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 3 texts:  10%|█         | 10/100 [02:00<18:10, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 3 texts:  11%|█         | 11/100 [02:12<17:53, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.14it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 3 texts:  12%|█▏        | 12/100 [02:24<17:41, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.58it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 3 texts:  13%|█▎        | 13/100 [02:36<17:25, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 52.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 3 texts:  14%|█▍        | 14/100 [02:49<17:19, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 3 texts:  15%|█▌        | 15/100 [03:00<17:03, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 3 texts:  16%|█▌        | 16/100 [03:13<16:56, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 3 texts:  17%|█▋        | 17/100 [03:25<16:38, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.31it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 3 texts:  18%|█▊        | 18/100 [03:37<16:26, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 3 texts:  19%|█▉        | 19/100 [03:49<16:17, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.29it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 3 texts:  20%|██        | 20/100 [04:01<16:01, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.18it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 3 texts:  21%|██        | 21/100 [04:13<15:46, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.62it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 3 texts:  22%|██▏       | 22/100 [04:24<15:33, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.23it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 3 texts:  23%|██▎       | 23/100 [04:36<15:19, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 3 texts:  24%|██▍       | 24/100 [04:49<15:16, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.94it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 3 texts:  25%|██▌       | 25/100 [05:01<15:01, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 3 texts:  26%|██▌       | 26/100 [05:12<14:44, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.77it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 3 texts:  27%|██▋       | 27/100 [05:24<14:31, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.44it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 3 texts:  28%|██▊       | 28/100 [05:36<14:21, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.96it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 3 texts:  29%|██▉       | 29/100 [05:48<14:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 3 texts:  30%|███       | 30/100 [06:01<14:03, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 3 texts:  31%|███       | 31/100 [06:12<13:44, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.84it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 3 texts:  32%|███▏      | 32/100 [06:24<13:30, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.58it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 3 texts:  33%|███▎      | 33/100 [06:36<13:21, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 3 texts:  34%|███▍      | 34/100 [06:48<13:09, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.93it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 3 texts:  35%|███▌      | 35/100 [07:00<12:57, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.28it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 3 texts:  36%|███▌      | 36/100 [07:12<12:42, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 3 texts:  37%|███▋      | 37/100 [07:24<12:31, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.32it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 3 texts:  38%|███▊      | 38/100 [07:36<12:20, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 3 texts:  39%|███▉      | 39/100 [07:48<12:14, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.43it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 3 texts:  40%|████      | 40/100 [08:00<12:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.15it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 3 texts:  41%|████      | 41/100 [08:12<11:45, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.73it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 3 texts:  42%|████▏     | 42/100 [08:24<11:31, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.40it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 3 texts:  43%|████▎     | 43/100 [08:36<11:20, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.57it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 3 texts:  44%|████▍     | 44/100 [08:48<11:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.22it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 3 texts:  45%|████▌     | 45/100 [09:00<10:56, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 3 texts:  46%|████▌     | 46/100 [09:11<10:42, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 3 texts:  47%|████▋     | 47/100 [09:23<10:30, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.62it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 3 texts:  48%|████▊     | 48/100 [09:35<10:18, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 3 texts:  49%|████▉     | 49/100 [09:47<10:07, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 3 texts:  50%|█████     | 50/100 [09:59<09:56, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 3 texts:  51%|█████     | 51/100 [10:11<09:46, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 3 texts:  52%|█████▏    | 52/100 [10:23<09:33, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.87it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 3 texts:  53%|█████▎    | 53/100 [10:35<09:24, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.53it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 3 texts:  54%|█████▍    | 54/100 [10:48<09:17, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.73it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 3 texts:  55%|█████▌    | 55/100 [11:00<09:05, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 3 texts:  56%|█████▌    | 56/100 [11:12<08:51, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.32it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 3 texts:  57%|█████▋    | 57/100 [11:24<08:37, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.84it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 3 texts:  58%|█████▊    | 58/100 [11:36<08:23, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.56it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 3 texts:  59%|█████▉    | 59/100 [11:48<08:18, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 3 texts:  60%|██████    | 60/100 [12:00<08:04, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.27it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 3 texts:  61%|██████    | 61/100 [12:12<07:49, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 3 texts:  62%|██████▏   | 62/100 [12:24<07:39, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 3 texts:  63%|██████▎   | 63/100 [12:36<07:27, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 3 texts:  64%|██████▍   | 64/100 [12:48<07:13, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.64it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 3 texts:  65%|██████▌   | 65/100 [13:00<07:02, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 3 texts:  66%|██████▌   | 66/100 [13:12<06:49, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.83it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 3 texts:  67%|██████▋   | 67/100 [13:24<06:38, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.49it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 3 texts:  68%|██████▊   | 68/100 [13:37<06:25, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.23it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 3 texts:  69%|██████▉   | 69/100 [13:49<06:13, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 3 texts:  70%|███████   | 70/100 [14:01<06:02, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 3 texts:  71%|███████   | 71/100 [14:13<05:49, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 3 texts:  72%|███████▏  | 72/100 [14:25<05:37, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 3 texts:  73%|███████▎  | 73/100 [14:37<05:25, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 3 texts:  74%|███████▍  | 74/100 [14:49<05:13, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.93it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 3 texts:  75%|███████▌  | 75/100 [15:01<05:01, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.59it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 3 texts:  76%|███████▌  | 76/100 [15:13<04:49, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.27it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 3 texts:  77%|███████▋  | 77/100 [15:25<04:39, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 3 texts:  78%|███████▊  | 78/100 [15:37<04:26, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.88it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 3 texts:  79%|███████▉  | 79/100 [15:50<04:15, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 3 texts:  80%|████████  | 80/100 [16:02<04:03, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 3 texts:  81%|████████  | 81/100 [16:14<03:49, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.64it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 3 texts:  82%|████████▏ | 82/100 [16:26<03:36, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 3 texts:  83%|████████▎ | 83/100 [16:38<03:24, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 3 texts:  84%|████████▍ | 84/100 [16:50<03:13, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 3 texts:  85%|████████▌ | 85/100 [17:02<03:01, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.13it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 3 texts:  86%|████████▌ | 86/100 [17:14<02:49, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.98it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 3 texts:  87%|████████▋ | 87/100 [17:27<02:38, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.98it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 3 texts:  88%|████████▊ | 88/100 [17:39<02:26, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 3 texts:  89%|████████▉ | 89/100 [17:51<02:13, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.31it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 3 texts:  90%|█████████ | 90/100 [18:03<02:01, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 3 texts:  91%|█████████ | 91/100 [18:15<01:49, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.17it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 3 texts:  92%|█████████▏| 92/100 [18:27<01:36, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 3 texts:  93%|█████████▎| 93/100 [18:39<01:24, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 3 texts:  94%|█████████▍| 94/100 [18:51<01:12, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 3 texts:  95%|█████████▌| 95/100 [19:03<01:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.64it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 3 texts:  96%|█████████▌| 96/100 [19:15<00:47, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 3 texts:  97%|█████████▋| 97/100 [19:27<00:35, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.53it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 3 texts:  98%|█████████▊| 98/100 [19:38<00:23, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.31it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 3 texts:  99%|█████████▉| 99/100 [19:50<00:11, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.50it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 3 texts: 100%|██████████| 100/100 [20:03<00:00, 12.00s/it][A
Processing layers:  16%|█▌        | 4/25 [1:20:08<7:00:47, 1202.26s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
   Memory after layer 3: 12.83GB

📊 Processing Layer 4
   📥 Loading SAE Layer 4: layer_4/width_16k/canonical



Layer 4 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 4 texts:   1%|          | 1/100 [00:11<19:24, 11.76s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.73it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 4 texts:   2%|▏         | 2/100 [00:23<19:17, 11.81s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 4 texts:   3%|▎         | 3/100 [00:35<19:11, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.39it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:   4%|▍         | 4/100 [00:47<18:57, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.50it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:   5%|▌         | 5/100 [00:59<18:48, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.15it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:   6%|▌         | 6/100 [01:11<18:46, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.75it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 4 texts:   7%|▋         | 7/100 [01:23<18:36, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 4 texts:   8%|▊         | 8/100 [01:35<18:26, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.37it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 4 texts:   9%|▉         | 9/100 [01:47<18:14, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.23it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 4 texts:  10%|█         | 10/100 [01:59<18:07, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 4 texts:  11%|█         | 11/100 [02:11<17:51, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 4 texts:  12%|█▏        | 12/100 [02:23<17:39, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.55it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 4 texts:  13%|█▎        | 13/100 [02:35<17:26, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.17it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  14%|█▍        | 14/100 [02:47<17:09, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 4 texts:  15%|█▌        | 15/100 [02:59<16:58, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 4 texts:  16%|█▌        | 16/100 [03:11<16:46, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.33it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 4 texts:  17%|█▋        | 17/100 [03:23<16:35, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.99it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 4 texts:  18%|█▊        | 18/100 [03:35<16:24, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.45it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  19%|█▉        | 19/100 [03:47<16:07, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  20%|██        | 20/100 [03:59<15:55, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.69it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 4 texts:  21%|██        | 21/100 [04:11<15:43, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.48it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 4 texts:  22%|██▏       | 22/100 [04:23<15:34, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 4 texts:  23%|██▎       | 23/100 [04:35<15:19, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.41it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.11it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.86it/s][A[A

Layer 4 texts:  24%|██▍       | 24/100 [04:47<15:07, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.03it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 4 texts:  25%|██▌       | 25/100 [04:59<14:57, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 4 texts:  26%|██▌       | 26/100 [05:11<14:45, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.29it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 4 texts:  27%|██▋       | 27/100 [05:23<14:30, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.86it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 4 texts:  28%|██▊       | 28/100 [05:34<14:17, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 4 texts:  29%|██▉       | 29/100 [05:46<14:02, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.14it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  30%|███       | 30/100 [05:58<13:49, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.77it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 4 texts:  31%|███       | 31/100 [06:10<13:40, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.44it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 4 texts:  32%|███▏      | 32/100 [06:22<13:29, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.99it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 4 texts:  33%|███▎      | 33/100 [06:34<13:18, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.25it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  34%|███▍      | 34/100 [06:46<13:06, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.85it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s][A[A

Layer 4 texts:  35%|███▌      | 35/100 [06:58<12:53, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.04it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 4 texts:  36%|███▌      | 36/100 [07:10<12:42, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.17it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 4 texts:  37%|███▋      | 37/100 [07:22<12:30, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.47it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 4 texts:  38%|███▊      | 38/100 [07:33<12:18, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.91it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  39%|███▉      | 39/100 [07:46<12:10, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.34it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 4 texts:  40%|████      | 40/100 [07:57<11:56, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 4 texts:  41%|████      | 41/100 [08:09<11:42, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.84it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 4 texts:  42%|████▏     | 42/100 [08:21<11:27, 11.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.99it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 4 texts:  43%|████▎     | 43/100 [08:33<11:16, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.31it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 4 texts:  44%|████▍     | 44/100 [08:45<11:06, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.38it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 4 texts:  45%|████▌     | 45/100 [08:57<10:52, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.02it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 4 texts:  46%|████▌     | 46/100 [09:08<10:38, 11.82s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 4 texts:  47%|████▋     | 47/100 [09:20<10:27, 11.84s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.73it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  48%|████▊     | 48/100 [09:32<10:17, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 4 texts:  49%|████▉     | 49/100 [09:44<10:03, 11.84s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.41it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  50%|█████     | 50/100 [09:56<09:54, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 4 texts:  51%|█████     | 51/100 [10:08<09:43, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 4 texts:  52%|█████▏    | 52/100 [10:20<09:30, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.20it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  53%|█████▎    | 53/100 [10:32<09:18, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  62%|██████▏   | 62/100 [12:19<07:37, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.23it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 4 texts:  63%|██████▎   | 63/100 [12:31<07:24, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.24it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 4 texts:  64%|██████▍   | 64/100 [12:43<07:10, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.74it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 4 texts:  65%|██████▌   | 65/100 [12:55<06:58, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.27it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  66%|██████▌   | 66/100 [13:07<06:45, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.90it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 4 texts:  67%|██████▋   | 67/100 [13:19<06:33, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.39it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 4 texts:  68%|██████▊   | 68/100 [13:31<06:21, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.01it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 4 texts:  69%|██████▉   | 69/100 [13:43<06:11, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 4 texts:  70%|███████   | 70/100 [13:55<05:59, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 4 texts:  71%|███████   | 71/100 [14:07<05:46, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.71it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  72%|███████▏  | 72/100 [14:18<05:33, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.04it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 4 texts:  73%|███████▎  | 73/100 [14:30<05:21, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 4 texts:  74%|███████▍  | 74/100 [14:42<05:10, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 4 texts:  75%|███████▌  | 75/100 [14:54<04:59, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.63it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 4 texts:  76%|███████▌  | 76/100 [15:07<04:50, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 4 texts:  77%|███████▋  | 77/100 [15:19<04:36, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 4 texts:  78%|███████▊  | 78/100 [15:31<04:24, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.91it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 4 texts:  79%|███████▉  | 79/100 [15:43<04:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.43it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 4 texts:  80%|████████  | 80/100 [15:55<04:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.48it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 4 texts:  81%|████████  | 81/100 [16:07<03:47, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 4 texts:  82%|████████▏ | 82/100 [16:18<03:34, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.44it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 4 texts:  83%|████████▎ | 83/100 [16:30<03:21, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.41it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 4 texts:  84%|████████▍ | 84/100 [16:42<03:10, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 4 texts:  85%|████████▌ | 85/100 [16:54<02:58, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.82it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 4 texts:  86%|████████▌ | 86/100 [17:06<02:47, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.43it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 4 texts:  87%|████████▋ | 87/100 [17:18<02:34, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 4 texts:  88%|████████▊ | 88/100 [17:30<02:22, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.17it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 4 texts:  89%|████████▉ | 89/100 [17:42<02:10, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 4 texts:  90%|█████████ | 90/100 [17:53<01:58, 11.84s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 4 texts:  91%|█████████ | 91/100 [18:05<01:46, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 4 texts:  92%|█████████▏| 92/100 [18:17<01:35, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.02it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 4 texts:  93%|█████████▎| 93/100 [18:29<01:23, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.01it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 4 texts:  94%|█████████▍| 94/100 [18:41<01:11, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 4 texts:  95%|█████████▌| 95/100 [18:53<00:59, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.08it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 4 texts:  96%|█████████▌| 96/100 [19:05<00:47, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.73it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 4 texts:  97%|█████████▋| 97/100 [19:17<00:35, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 4 texts:  98%|█████████▊| 98/100 [19:29<00:23, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.91it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 4 texts:  99%|█████████▉| 99/100 [19:41<00:11, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 4 texts: 100%|██████████| 100/100 [19:53<00:00, 11.96s/it][A
Processing layers:  20%|██        | 5/25 [1:40:02<6:39:47, 1199.36s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
   Memory after layer 4: 12.83GB

📊 Processing Layer 5
   📥 Loading SAE Layer 5: layer_5/width_16k/canonical



Layer 5 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.75it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 5 texts:   1%|          | 1/100 [00:11<19:42, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.32it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 5 texts:   2%|▏         | 2/100 [00:23<19:31, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.96it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 5 texts:   3%|▎         | 3/100 [00:35<19:22, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.17it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 5 texts:   4%|▍         | 4/100 [00:47<19:09, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.34it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 5 texts:   5%|▌         | 5/100 [00:59<18:56, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 5 texts:   6%|▌         | 6/100 [01:12<18:54, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.50it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 5 texts:   7%|▋         | 7/100 [01:24<18:46, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.37it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 5 texts:   8%|▊         | 8/100 [01:36<18:37, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.25it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 5 texts:   9%|▉         | 9/100 [01:48<18:19, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.86it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 5 texts:  10%|█         | 10/100 [02:00<17:58, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.01it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 5 texts:  11%|█         | 11/100 [02:12<17:42, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.63it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 5 texts:  12%|█▏        | 12/100 [02:24<17:33, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.83it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 5 texts:  13%|█▎        | 13/100 [02:35<17:18, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.29it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 5 texts:  14%|█▍        | 14/100 [02:47<17:08, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 5 texts:  15%|█▌        | 15/100 [02:59<16:53, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 5 texts:  16%|█▌        | 16/100 [03:11<16:40, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.77it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 5 texts:  17%|█▋        | 17/100 [03:23<16:31, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 5 texts:  18%|█▊        | 18/100 [03:35<16:16, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.15it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 5 texts:  19%|█▉        | 19/100 [03:47<16:04, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 5 texts:  20%|██        | 20/100 [03:59<15:54, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.59it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 5 texts:  21%|██        | 21/100 [04:11<15:40, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 5 texts:  22%|██▏       | 22/100 [04:23<15:27, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 5 texts:  23%|██▎       | 23/100 [04:35<15:16, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.71it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 5 texts:  24%|██▍       | 24/100 [04:47<15:10, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 5 texts:  25%|██▌       | 25/100 [04:59<15:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 5 texts:  26%|██▌       | 26/100 [05:11<14:49, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.25it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 5 texts:  27%|██▋       | 27/100 [05:23<14:43, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.23it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 5 texts:  28%|██▊       | 28/100 [05:35<14:26, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.57it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 5 texts:  29%|██▉       | 29/100 [05:47<14:13, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 5 texts:  30%|███       | 30/100 [05:59<14:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 5 texts:  31%|███       | 31/100 [06:11<13:45, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 5 texts:  32%|███▏      | 32/100 [06:23<13:32, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.01it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 5 texts:  33%|███▎      | 33/100 [06:35<13:19, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.77it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 5 texts:  34%|███▍      | 34/100 [06:47<13:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 5 texts:  35%|███▌      | 35/100 [06:59<12:58, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 5 texts:  36%|███▌      | 36/100 [07:11<12:51, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 5 texts:  37%|███▋      | 37/100 [07:23<12:32, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.38it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 5 texts:  38%|███▊      | 38/100 [07:35<12:20, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.94it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 5 texts:  39%|███▉      | 39/100 [07:47<12:09, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 5 texts:  40%|████      | 40/100 [07:58<11:56, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 5 texts:  41%|████      | 41/100 [08:11<11:55, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 5 texts:  42%|████▏     | 42/100 [08:23<11:41, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.38it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 5 texts:  43%|████▎     | 43/100 [08:35<11:25, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 5 texts:  44%|████▍     | 44/100 [08:47<11:14, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 5 texts:  45%|████▌     | 45/100 [08:59<11:02, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.75it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 5 texts:  46%|████▌     | 46/100 [09:11<10:53, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.82it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 5 texts:  47%|████▋     | 47/100 [09:23<10:38, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.90it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 5 texts:  48%|████▊     | 48/100 [09:35<10:26, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.90it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 5 texts:  49%|████▉     | 49/100 [09:47<10:15, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 5 texts:  50%|█████     | 50/100 [09:59<10:00, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.86it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 5 texts:  51%|█████     | 51/100 [10:11<09:47, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.23it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 5 texts:  52%|█████▏    | 52/100 [10:23<09:36, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.90it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 5 texts:  53%|█████▎    | 53/100 [10:35<09:21, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.39it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 5 texts:  54%|█████▍    | 54/100 [10:47<09:07, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.77it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 5 texts:  55%|█████▌    | 55/100 [10:59<08:58, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 5 texts:  56%|█████▌    | 56/100 [11:11<08:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.08it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 5 texts:  57%|█████▋    | 57/100 [11:23<08:35, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.01it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 5 texts:  58%|█████▊    | 58/100 [11:35<08:20, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.85it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 5 texts:  59%|█████▉    | 59/100 [11:47<08:09, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.43it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 5 texts:  60%|██████    | 60/100 [11:59<07:58, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.69it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 5 texts:  61%|██████    | 61/100 [12:11<07:45, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.20it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 5 texts:  62%|██████▏   | 62/100 [12:23<07:33, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.56it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 5 texts:  63%|██████▎   | 63/100 [12:34<07:21, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.31it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 5 texts:  64%|██████▍   | 64/100 [12:46<07:08, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 5 texts:  65%|██████▌   | 65/100 [12:58<06:56, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 5 texts:  66%|██████▌   | 66/100 [13:10<06:44, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.05it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.78it/s][A[A

Layer 5 texts:  67%|██████▋   | 67/100 [13:22<06:35, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.32it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 5 texts:  68%|██████▊   | 68/100 [13:34<06:21, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 5 texts:  69%|██████▉   | 69/100 [13:46<06:09, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.20it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 5 texts:  70%|███████   | 70/100 [13:58<05:59, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.86it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 5 texts:  71%|███████   | 71/100 [14:10<05:45, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 5 texts:  72%|███████▏  | 72/100 [14:22<05:33, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 5 texts:  73%|███████▎  | 73/100 [14:34<05:22, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.69it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 5 texts:  74%|███████▍  | 74/100 [14:46<05:09, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.46it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.09it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.83it/s][A[A

Layer 5 texts:  75%|███████▌  | 75/100 [14:57<04:57, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.82it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 5 texts:  76%|███████▌  | 76/100 [15:09<04:45, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.88it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 5 texts:  77%|███████▋  | 77/100 [15:21<04:34, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.94it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 5 texts:  78%|███████▊  | 78/100 [15:33<04:22, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 5 texts:  79%|███████▉  | 79/100 [15:45<04:10, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.71it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 5 texts:  80%|████████  | 80/100 [15:57<03:57, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.09it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 5 texts:  81%|████████  | 81/100 [16:09<03:45, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 5 texts:  82%|████████▏ | 82/100 [16:21<03:34, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.75it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 5 texts:  83%|████████▎ | 83/100 [16:33<03:23, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.05it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 5 texts:  84%|████████▍ | 84/100 [16:45<03:11, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.48it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 5 texts:  85%|████████▌ | 85/100 [16:57<03:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.49it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 5 texts:  86%|████████▌ | 86/100 [17:09<02:49, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 5 texts:  87%|████████▋ | 87/100 [17:21<02:36, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.77it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 5 texts:  88%|████████▊ | 88/100 [17:33<02:25, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.24it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 5 texts:  89%|████████▉ | 89/100 [17:46<02:13, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 5 texts:  90%|█████████ | 90/100 [17:58<02:01, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.38it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.88it/s][A[A

Layer 5 texts:  91%|█████████ | 91/100 [18:10<01:49, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.85it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 5 texts:  92%|█████████▏| 92/100 [18:22<01:37, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.20it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 5 texts:  93%|█████████▎| 93/100 [18:34<01:24, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.55it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 5 texts:  94%|█████████▍| 94/100 [18:46<01:12, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.33it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 5 texts:  95%|█████████▌| 95/100 [18:59<01:01, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 5 texts:  96%|█████████▌| 96/100 [19:11<00:48, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 5 texts:  97%|█████████▋| 97/100 [19:23<00:36, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.01it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 5 texts:  98%|█████████▊| 98/100 [19:35<00:24, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.11it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s][A[A

Layer 5 texts:  99%|█████████▉| 99/100 [19:47<00:12, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 5 texts: 100%|██████████| 100/100 [19:59<00:00, 12.15s/it][A
Processing layers:  24%|██▍       | 6/25 [2:00:03<6:19:56, 1199.84s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
   Memory after layer 5: 12.83GB

📊 Processing Layer 6
   📥 Loading SAE Layer 6: layer_6/width_16k/canonical



Layer 6 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.15it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 6 texts:   1%|          | 1/100 [00:12<20:22, 12.35s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 6 texts:   2%|▏         | 2/100 [00:24<19:47, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.92it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 6 texts:   3%|▎         | 3/100 [00:36<19:39, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.47it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:   4%|▍         | 4/100 [00:48<19:28, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.62it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 6 texts:   5%|▌         | 5/100 [01:00<19:19, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 6 texts:   6%|▌         | 6/100 [01:13<19:05, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.27it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:   7%|▋         | 7/100 [01:25<18:45, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 6 texts:   8%|▊         | 8/100 [01:37<18:29, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:   9%|▉         | 9/100 [01:49<18:21, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.15it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 6 texts:  10%|█         | 10/100 [02:01<18:02, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.56it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 6 texts:  11%|█         | 11/100 [02:13<17:49, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 6 texts:  12%|█▏        | 12/100 [02:24<17:34, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.02it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 6 texts:  13%|█▎        | 13/100 [02:36<17:22, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.34it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 6 texts:  14%|█▍        | 14/100 [02:48<17:08, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 6 texts:  15%|█▌        | 15/100 [03:01<17:02, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 6 texts:  16%|█▌        | 16/100 [03:12<16:47, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 43.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 6 texts:  17%|█▋        | 17/100 [03:25<16:39, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.57it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:  18%|█▊        | 18/100 [03:37<16:32, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 6 texts:  19%|█▉        | 19/100 [03:49<16:19, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.49it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:  20%|██        | 20/100 [04:01<16:05, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 6 texts:  21%|██        | 21/100 [04:13<15:50, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.96it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 6 texts:  22%|██▏       | 22/100 [04:25<15:45, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.38it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 6 texts:  23%|██▎       | 23/100 [04:37<15:33, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.48it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 6 texts:  24%|██▍       | 24/100 [04:49<15:15, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.74it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 6 texts:  25%|██▌       | 25/100 [05:01<15:03, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 6 texts:  26%|██▌       | 26/100 [05:14<14:57, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.47it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:  27%|██▋       | 27/100 [05:26<14:42, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.91it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 6 texts:  28%|██▊       | 28/100 [05:38<14:32, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.41it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 6 texts:  29%|██▉       | 29/100 [05:51<14:52, 12.58s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 6 texts:  30%|███       | 30/100 [06:04<14:31, 12.45s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 6 texts:  31%|███       | 31/100 [06:17<14:31, 12.62s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 6 texts:  32%|███▏      | 32/100 [06:30<14:33, 12.85s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.47it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 6 texts:  33%|███▎      | 33/100 [06:43<14:14, 12.76s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.91it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 6 texts:  34%|███▍      | 34/100 [06:55<13:55, 12.65s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.64it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 6 texts:  35%|███▌      | 35/100 [07:07<13:34, 12.53s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.63it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 6 texts:  36%|███▌      | 36/100 [07:19<13:14, 12.41s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:  37%|███▋      | 37/100 [07:31<12:55, 12.31s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.44it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 6 texts:  38%|███▊      | 38/100 [07:43<12:39, 12.25s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 6 texts:  39%|███▉      | 39/100 [07:56<12:24, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.41it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 6 texts:  40%|████      | 40/100 [08:08<12:10, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 6 texts:  41%|████      | 41/100 [08:20<11:53, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.44it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 6 texts:  42%|████▏     | 42/100 [08:32<11:42, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.88it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 6 texts:  43%|████▎     | 43/100 [08:44<11:29, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.31it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 6 texts:  44%|████▍     | 44/100 [08:56<11:22, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.20it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 6 texts:  45%|████▌     | 45/100 [09:09<11:12, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.32it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 6 texts:  46%|████▌     | 46/100 [09:21<11:00, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.18it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 6 texts:  47%|████▋     | 47/100 [09:33<10:46, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.23it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 6 texts:  48%|████▊     | 48/100 [09:45<10:29, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 6 texts:  49%|████▉     | 49/100 [09:57<10:17, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 6 texts:  50%|█████     | 50/100 [10:09<10:03, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.73it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 6 texts:  51%|█████     | 51/100 [10:21<09:50, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.73it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 6 texts:  52%|█████▏    | 52/100 [10:33<09:38, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s][A[A

Layer 6 texts:  53%|█████▎    | 53/100 [10:45<09:27, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 6 texts:  54%|█████▍    | 54/100 [10:57<09:19, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.85it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 6 texts:  55%|█████▌    | 55/100 [11:09<09:04, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:  56%|█████▌    | 56/100 [11:21<08:52, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 6 texts:  57%|█████▋    | 57/100 [11:34<08:39, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.53it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 6 texts:  58%|█████▊    | 58/100 [11:46<08:29, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.75it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 6 texts:  59%|█████▉    | 59/100 [11:58<08:16, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.56it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 6 texts:  60%|██████    | 60/100 [12:10<08:03, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 6 texts:  61%|██████    | 61/100 [12:22<07:51, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.17it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 6 texts:  62%|██████▏   | 62/100 [12:34<07:41, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.03it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 6 texts:  63%|██████▎   | 63/100 [12:46<07:28, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.59it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 6 texts:  64%|██████▍   | 64/100 [12:58<07:16, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:  65%|██████▌   | 65/100 [13:10<07:03, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.03it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 6 texts:  66%|██████▌   | 66/100 [13:23<06:51, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 6 texts:  67%|██████▋   | 67/100 [13:35<06:39, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 6 texts:  68%|██████▊   | 68/100 [13:47<06:31, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.17it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 6 texts:  69%|██████▉   | 69/100 [13:59<06:17, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 6 texts:  70%|███████   | 70/100 [14:12<06:11, 12.38s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.49it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:  71%|███████   | 71/100 [14:26<06:14, 12.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:  72%|███████▏  | 72/100 [14:40<06:08, 13.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:  73%|███████▎  | 73/100 [14:52<05:48, 12.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.46it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 6 texts:  74%|███████▍  | 74/100 [15:05<05:32, 12.79s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 6 texts:  75%|███████▌  | 75/100 [15:17<05:18, 12.73s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.29it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 6 texts:  76%|███████▌  | 76/100 [15:30<05:01, 12.57s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:  77%|███████▋  | 77/100 [15:42<04:46, 12.46s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.53it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 6 texts:  78%|███████▊  | 78/100 [15:54<04:32, 12.39s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.33it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 6 texts:  79%|███████▉  | 79/100 [16:06<04:19, 12.36s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.25it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 6 texts:  80%|████████  | 80/100 [16:19<04:06, 12.34s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.58it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 6 texts:  81%|████████  | 81/100 [16:31<03:54, 12.32s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 6 texts:  82%|████████▏ | 82/100 [16:43<03:40, 12.25s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 6 texts:  83%|████████▎ | 83/100 [16:55<03:28, 12.27s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.40it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 6 texts:  84%|████████▍ | 84/100 [17:07<03:14, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.69it/s][A[A

Layer 6 texts:  85%|████████▌ | 85/100 [17:20<03:03, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.69it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.06it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.79it/s][A[A

Layer 6 texts:  86%|████████▌ | 86/100 [17:32<02:51, 12.26s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.58it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.01it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.72it/s][A[A

Layer 6 texts:  87%|████████▋ | 87/100 [17:44<02:39, 12.25s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 6 texts:  88%|████████▊ | 88/100 [17:56<02:26, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.22it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.73it/s][A[A

Layer 6 texts:  89%|████████▉ | 89/100 [18:09<02:14, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.04it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.77it/s][A[A

Layer 6 texts:  90%|█████████ | 90/100 [18:21<02:02, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s][A[A

Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 21.37it/s][A[A


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 6 texts:  91%|█████████ | 91/100 [18:33<01:50, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.88it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 6 texts:  92%|█████████▏| 92/100 [18:45<01:37, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.31it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 6 texts:  93%|█████████▎| 93/100 [18:57<01:25, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.11it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.87it/s][A[A

Layer 6 texts:  94%|█████████▍| 94/100 [19:10<01:13, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 6 texts:  95%|█████████▌| 95/100 [19:22<01:01, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.39it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts:  96%|█████████▌| 96/100 [19:34<00:48, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 6 texts:  97%|█████████▋| 97/100 [19:46<00:36, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.64it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 6 texts:  98%|█████████▊| 98/100 [19:58<00:24, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.80it/s][A[A

Layer 6 texts:  99%|█████████▉| 99/100 [20:11<00:12, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.27it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 6 texts: 100%|██████████| 100/100 [20:23<00:00, 12.22s/it][A
Processing layers:  28%|██▊       | 7/25 [2:20:27<6:02:21, 1207.87s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
   Memory after layer 6: 12.83GB

📊 Processing Layer 7
   📥 Loading SAE Layer 7: layer_7/width_16k/canonical



Layer 7 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.55it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 7 texts:   1%|          | 1/100 [00:12<20:04, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.25it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.09it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.82it/s][A[A

Layer 7 texts:   2%|▏         | 2/100 [00:24<19:54, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 7 texts:   3%|▎         | 3/100 [00:36<19:32, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 7 texts:   4%|▍         | 4/100 [00:48<19:28, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.45it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 7 texts:   5%|▌         | 5/100 [01:00<19:12, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.04it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 7 texts:   6%|▌         | 6/100 [01:12<18:52, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.10it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.83it/s][A[A

Layer 7 texts:   7%|▋         | 7/100 [01:24<18:42, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.98it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 7 texts:   8%|▊         | 8/100 [01:36<18:33, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.99it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 7 texts:   9%|▉         | 9/100 [01:49<18:25, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 7 texts:  10%|█         | 10/100 [02:01<18:17, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 7 texts:  11%|█         | 11/100 [02:13<18:07, 12.22s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.40it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.11it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.87it/s][A[A

Layer 7 texts:  12%|█▏        | 12/100 [02:25<17:51, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 7 texts:  13%|█▎        | 13/100 [02:37<17:34, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 7 texts:  14%|█▍        | 14/100 [02:49<17:14, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 7 texts:  15%|█▌        | 15/100 [03:01<17:07, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.98it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 7 texts:  16%|█▌        | 16/100 [03:13<16:52, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.02it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 7 texts:  17%|█▋        | 17/100 [03:25<16:37, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.27it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 7 texts:  18%|█▊        | 18/100 [03:37<16:19, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 7 texts:  19%|█▉        | 19/100 [03:49<16:00, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.85it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 7 texts:  20%|██        | 20/100 [04:01<15:52, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.32it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 7 texts:  21%|██        | 21/100 [04:13<15:45, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.31it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.62it/s][A[A

Layer 7 texts:  22%|██▏       | 22/100 [04:25<15:42, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.09it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 7 texts:  23%|██▎       | 23/100 [04:37<15:23, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.38it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 7 texts:  24%|██▍       | 24/100 [04:49<15:24, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 7 texts:  25%|██▌       | 25/100 [05:02<15:13, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.75it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 7 texts:  26%|██▌       | 26/100 [05:14<15:04, 12.22s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 7 texts:  27%|██▋       | 27/100 [05:26<14:57, 12.29s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.88it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 7 texts:  28%|██▊       | 28/100 [05:38<14:35, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 7 texts:  29%|██▉       | 29/100 [05:50<14:22, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.96it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 7 texts:  30%|███       | 30/100 [06:02<14:04, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 7 texts:  31%|███       | 31/100 [06:14<13:46, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.50it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 7 texts:  32%|███▏      | 32/100 [06:26<13:37, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.85it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 7 texts:  33%|███▎      | 33/100 [06:38<13:25, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.48it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 7 texts:  34%|███▍      | 34/100 [06:50<13:12, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.15it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 7 texts:  35%|███▌      | 35/100 [07:02<12:56, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 7 texts:  36%|███▌      | 36/100 [07:14<12:41, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.27it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 7 texts:  37%|███▋      | 37/100 [07:26<12:29, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.37it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 7 texts:  38%|███▊      | 38/100 [07:38<12:24, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 7 texts:  39%|███▉      | 39/100 [07:50<12:13, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 7 texts:  40%|████      | 40/100 [08:02<12:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 7 texts:  41%|████      | 41/100 [08:14<11:46, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.18it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 7 texts:  42%|████▏     | 42/100 [08:26<11:32, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 7 texts:  43%|████▎     | 43/100 [08:38<11:23, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.94it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 7 texts:  44%|████▍     | 44/100 [08:50<11:17, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.20it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 7 texts:  45%|████▌     | 45/100 [09:02<11:03, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.71it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 7 texts:  46%|████▌     | 46/100 [09:14<10:49, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.71it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 7 texts:  47%|████▋     | 47/100 [09:26<10:41, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.43it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 7 texts:  48%|████▊     | 48/100 [09:39<10:30, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.08it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 7 texts:  49%|████▉     | 49/100 [09:51<10:21, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 7 texts:  50%|█████     | 50/100 [10:03<10:08, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 7 texts:  51%|█████     | 51/100 [10:15<09:55, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.32it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.88it/s][A[A

Layer 7 texts:  52%|█████▏    | 52/100 [10:27<09:41, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.15it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 7 texts:  53%|█████▎    | 53/100 [10:39<09:25, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.82it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 7 texts:  54%|█████▍    | 54/100 [10:51<09:09, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.41it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.81it/s][A[A

Layer 7 texts:  55%|█████▌    | 55/100 [11:03<08:57, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 7 texts:  56%|█████▌    | 56/100 [11:15<08:49, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 7 texts:  57%|█████▋    | 57/100 [11:27<08:42, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 7 texts:  58%|█████▊    | 58/100 [11:39<08:29, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.62it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.10s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.57it/s][A[A

Layer 7 texts:  59%|█████▉    | 59/100 [11:52<08:19, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 7 texts:  60%|██████    | 60/100 [12:04<08:05, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 7 texts:  61%|██████    | 61/100 [12:16<07:57, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.34it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 7 texts:  62%|██████▏   | 62/100 [12:29<07:47, 12.31s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.25it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 7 texts:  63%|██████▎   | 63/100 [12:41<07:31, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.33it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.05it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.78it/s][A[A

Layer 7 texts:  64%|██████▍   | 64/100 [12:53<07:17, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.46it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.07it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.81it/s][A[A

Layer 7 texts:  65%|██████▌   | 65/100 [13:05<07:04, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.59it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.58it/s][A[A

Layer 7 texts:  66%|██████▌   | 66/100 [13:17<06:53, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.32it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 7 texts:  67%|██████▋   | 67/100 [13:29<06:41, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.18it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 7 texts:  68%|██████▊   | 68/100 [13:41<06:29, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 7 texts:  69%|██████▉   | 69/100 [13:54<06:17, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.64it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 7 texts:  70%|███████   | 70/100 [14:06<06:02, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 7 texts:  71%|███████   | 71/100 [14:17<05:47, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 7 texts:  72%|███████▏  | 72/100 [14:29<05:35, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 7 texts:  73%|███████▎  | 73/100 [14:41<05:23, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 7 texts:  74%|███████▍  | 74/100 [14:53<05:10, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s][A[A

Layer 7 texts:  75%|███████▌  | 75/100 [15:05<04:58, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 7 texts:  76%|███████▌  | 76/100 [15:17<04:47, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.37it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 7 texts:  77%|███████▋  | 77/100 [15:30<04:39, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 7 texts:  78%|███████▊  | 78/100 [15:42<04:27, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 7 texts:  79%|███████▉  | 79/100 [15:54<04:15, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.85it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.11it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.86it/s][A[A

Layer 7 texts:  80%|████████  | 80/100 [16:06<04:02, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.83it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 7 texts:  81%|████████  | 81/100 [16:18<03:49, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.69it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.03it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.75it/s][A[A

Layer 7 texts:  82%|████████▏ | 82/100 [16:30<03:37, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 7 texts:  83%|████████▎ | 83/100 [16:42<03:25, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 7 texts:  84%|████████▍ | 84/100 [16:55<03:14, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.71it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 7 texts:  85%|████████▌ | 85/100 [17:07<03:02, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.53it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 7 texts:  86%|████████▌ | 86/100 [17:19<02:48, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.86it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.09s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.58it/s][A[A

Layer 7 texts:  87%|████████▋ | 87/100 [17:31<02:37, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.59it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.64it/s][A[A

Layer 7 texts:  88%|████████▊ | 88/100 [17:43<02:25, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 7 texts:  89%|████████▉ | 89/100 [17:55<02:14, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.85it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 7 texts:  90%|█████████ | 90/100 [18:08<02:02, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.38it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 7 texts:  91%|█████████ | 91/100 [18:20<01:49, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.59it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.12it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.88it/s][A[A

Layer 7 texts:  92%|█████████▏| 92/100 [18:31<01:36, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.62it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 7 texts:  93%|█████████▎| 93/100 [18:43<01:24, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s][A[A

Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 13.48it/s][A[A


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.06s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.61it/s][A[A

Layer 7 texts:  94%|█████████▍| 94/100 [18:56<01:12, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 7 texts:  95%|█████████▌| 95/100 [19:07<01:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.69it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 7 texts:  96%|█████████▌| 96/100 [19:20<00:48, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.92it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 7 texts:  97%|█████████▋| 97/100 [19:32<00:36, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.63it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.87it/s][A[A

Layer 7 texts:  98%|█████████▊| 98/100 [19:44<00:24, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.07it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.80it/s][A[A

Layer 7 texts:  99%|█████████▉| 99/100 [19:56<00:12, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.56it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 7 texts: 100%|██████████| 100/100 [20:08<00:00, 12.05s/it][A
Processing layers:  32%|███▏      | 8/25 [2:40:37<5:42:21, 1208.32s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
   Memory after layer 7: 12.83GB

📊 Processing Layer 8
   📥 Loading SAE Layer 8: layer_8/width_16k/canonical



Layer 8 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.73it/s][A[A

Layer 8 texts:   1%|          | 1/100 [00:12<19:57, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 8 texts:   2%|▏         | 2/100 [00:24<19:48, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 8 texts:   3%|▎         | 3/100 [00:36<19:37, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 54.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 8 texts:   4%|▍         | 4/100 [00:48<19:29, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.91it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 8 texts:   5%|▌         | 5/100 [01:01<19:31, 12.33s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.12it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.88it/s][A[A

Layer 8 texts:   6%|▌         | 6/100 [01:13<19:16, 12.30s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 8 texts:   7%|▋         | 7/100 [01:25<18:54, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.64it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.12it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.86it/s][A[A

Layer 8 texts:   8%|▊         | 8/100 [01:37<18:40, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.26s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.39it/s][A[A

Layer 8 texts:   9%|▉         | 9/100 [01:49<18:33, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 8 texts:  10%|█         | 10/100 [02:01<18:11, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.96it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  11%|█         | 11/100 [02:14<18:10, 12.25s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.37it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  12%|█▏        | 12/100 [02:26<17:55, 12.22s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  13%|█▎        | 13/100 [02:38<17:32, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 8 texts:  14%|█▍        | 14/100 [02:50<17:17, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.16s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.49it/s][A[A

Layer 8 texts:  15%|█▌        | 15/100 [03:02<17:18, 12.22s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 8 texts:  16%|█▌        | 16/100 [03:14<16:57, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.87it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 8 texts:  17%|█▋        | 17/100 [03:26<16:43, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.39it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 8 texts:  18%|█▊        | 18/100 [03:38<16:31, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  19%|█▉        | 19/100 [03:51<16:23, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 8 texts:  20%|██        | 20/100 [04:03<16:12, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.45it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 8 texts:  21%|██        | 21/100 [04:15<15:57, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.50it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 8 texts:  22%|██▏       | 22/100 [04:27<15:37, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.74it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.70it/s][A[A

Layer 8 texts:  23%|██▎       | 23/100 [04:39<15:26, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.28it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.78it/s][A[A

Layer 8 texts:  24%|██▍       | 24/100 [04:51<15:13, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.47it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.02s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.68it/s][A[A

Layer 8 texts:  25%|██▌       | 25/100 [05:03<15:06, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.14it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  26%|██▌       | 26/100 [05:15<14:53, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.41it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 8 texts:  27%|██▋       | 27/100 [05:27<14:39, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 8 texts:  28%|██▊       | 28/100 [05:39<14:24, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.28it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 8 texts:  29%|██▉       | 29/100 [05:51<14:07, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.73it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.60it/s][A[A

Layer 8 texts:  30%|███       | 30/100 [06:03<13:57, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.32it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.69it/s][A[A

Layer 8 texts:  31%|███       | 31/100 [06:15<13:45, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.47it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 8 texts:  32%|███▏      | 32/100 [06:27<13:34, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 8 texts:  33%|███▎      | 33/100 [06:39<13:26, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.86it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 8 texts:  34%|███▍      | 34/100 [06:51<13:14, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.20it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  35%|███▌      | 35/100 [07:03<13:02, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.99it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  36%|███▌      | 36/100 [07:15<12:44, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.91it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 8 texts:  37%|███▋      | 37/100 [07:27<12:33, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.20it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 8 texts:  38%|███▊      | 38/100 [07:39<12:25, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.53it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 8 texts:  39%|███▉      | 39/100 [07:51<12:14, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  40%|████      | 40/100 [08:03<12:02, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.20it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  41%|████      | 41/100 [08:15<11:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.93it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.28it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 8 texts:  42%|████▏     | 42/100 [08:27<11:39, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 8 texts:  43%|████▎     | 43/100 [08:39<11:27, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 8 texts:  44%|████▍     | 44/100 [08:52<11:23, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 8 texts:  45%|████▌     | 45/100 [09:04<11:12, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 52.83it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 8 texts:  46%|████▌     | 46/100 [09:16<10:56, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 49.84it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 8 texts:  47%|████▋     | 47/100 [09:28<10:39, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  48%|████▊     | 48/100 [09:40<10:24, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.57it/s][A[A

Layer 8 texts:  49%|████▉     | 49/100 [09:52<10:13, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 8 texts:  50%|█████     | 50/100 [10:04<10:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.62it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 8 texts:  51%|█████     | 51/100 [10:16<09:53, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  52%|█████▏    | 52/100 [10:29<09:45, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 8 texts:  53%|█████▎    | 53/100 [10:41<09:37, 12.28s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.02it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 8 texts:  54%|█████▍    | 54/100 [10:53<09:20, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 8 texts:  55%|█████▌    | 55/100 [11:05<09:05, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 8 texts:  56%|█████▌    | 56/100 [11:17<08:52, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 8 texts:  57%|█████▋    | 57/100 [11:29<08:37, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.41it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.09it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.81it/s][A[A

Layer 8 texts:  58%|█████▊    | 58/100 [11:41<08:25, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.57it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.01it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.71it/s][A[A

Layer 8 texts:  59%|█████▉    | 59/100 [11:53<08:14, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.18it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 8 texts:  60%|██████    | 60/100 [12:05<07:59, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.86it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 8 texts:  61%|██████    | 61/100 [12:17<07:51, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.29it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s][A[A

Layer 8 texts:  62%|██████▏   | 62/100 [12:29<07:39, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.98it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 8 texts:  63%|██████▎   | 63/100 [12:42<07:32, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.99it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 8 texts:  64%|██████▍   | 64/100 [12:54<07:21, 12.28s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.41it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 8 texts:  65%|██████▌   | 65/100 [13:06<07:06, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 8 texts:  66%|██████▌   | 66/100 [13:18<06:52, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.64it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.14s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.51it/s][A[A

Layer 8 texts:  67%|██████▋   | 67/100 [13:30<06:41, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.17it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 8 texts:  68%|██████▊   | 68/100 [13:42<06:27, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.09it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  69%|██████▉   | 69/100 [13:55<06:16, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  70%|███████   | 70/100 [14:07<06:07, 12.25s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 55.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 8 texts:  71%|███████   | 71/100 [14:20<05:56, 12.28s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A


⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])



Layer 8 texts:  72%|███████▏  | 72/100 [14:32<05:43, 12.25s/it][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.08it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 8 texts:  73%|███████▎  | 73/100 [14:44<05:29, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.90it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 8 texts:  74%|███████▍  | 74/100 [14:56<05:18, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.98it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 8 texts:  75%|███████▌  | 75/100 [15:08<05:05, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 8 texts:  76%|███████▌  | 76/100 [15:20<04:53, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.27it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 8 texts:  77%|███████▋  | 77/100 [15:33<04:40, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 53.09it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 8 texts:  78%|███████▊  | 78/100 [15:45<04:30, 12.29s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 8 texts:  79%|███████▉  | 79/100 [15:58<04:18, 12.33s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 8 texts:  80%|████████  | 80/100 [16:09<04:03, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.75it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 8 texts:  81%|████████  | 81/100 [16:21<03:50, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.32it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.03it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.75it/s][A[A

Layer 8 texts:  82%|████████▏ | 82/100 [16:34<03:39, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.23it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 8 texts:  83%|████████▎ | 83/100 [16:46<03:25, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.77it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  84%|████████▍ | 84/100 [16:58<03:12, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.45it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 8 texts:  85%|████████▌ | 85/100 [17:09<03:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 45.63it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 8 texts:  86%|████████▌ | 86/100 [17:21<02:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.05it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.78it/s][A[A

Layer 8 texts:  87%|████████▋ | 87/100 [17:34<02:36, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s][A[A

Layer 8 texts:  88%|████████▊ | 88/100 [17:46<02:24, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.40it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 8 texts:  89%|████████▉ | 89/100 [17:58<02:12, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 8 texts:  90%|█████████ | 90/100 [18:10<02:00, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  91%|█████████ | 91/100 [18:22<01:49, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.53it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 8 texts:  92%|█████████▏| 92/100 [18:34<01:36, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.74it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  93%|█████████▎| 93/100 [18:46<01:24, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 52.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.02s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.67it/s][A[A

Layer 8 texts:  94%|█████████▍| 94/100 [18:58<01:12, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.82it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.13it/s][A[A

Layer 8 texts:  95%|█████████▌| 95/100 [19:10<01:00, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 8 texts:  96%|█████████▌| 96/100 [19:22<00:48, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.49it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 8 texts:  97%|█████████▋| 97/100 [19:34<00:36, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.94it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 8 texts:  98%|█████████▊| 98/100 [19:47<00:24, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 8 texts:  99%|█████████▉| 99/100 [19:59<00:12, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 8 texts: 100%|██████████| 100/100 [20:11<00:00, 12.16s/it][A
Processing layers:  36%|███▌      | 9/25 [3:00:49<5:22:33, 1209.58s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
   Memory after layer 8: 12.83GB

📊 Processing Layer 9
   📥 Loading SAE Layer 9: layer_9/width_16k/canonical



Layer 9 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 9 texts:   1%|          | 1/100 [00:12<20:16, 12.29s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 9 texts:   2%|▏         | 2/100 [00:24<20:11, 12.36s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.23it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 9 texts:   3%|▎         | 3/100 [00:37<19:58, 12.35s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 9 texts:   4%|▍         | 4/100 [00:49<19:39, 12.29s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 57.93it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 9 texts:   5%|▌         | 5/100 [01:01<19:15, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.12s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.55it/s][A[A

Layer 9 texts:   6%|▌         | 6/100 [01:13<19:11, 12.26s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.27it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.06it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.78it/s][A[A

Layer 9 texts:   7%|▋         | 7/100 [01:25<18:53, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.64it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 9 texts:   8%|▊         | 8/100 [01:37<18:36, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.98it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 9 texts:   9%|▉         | 9/100 [01:49<18:27, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.28it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 9 texts:  10%|█         | 10/100 [02:02<18:16, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.01it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 9 texts:  11%|█         | 11/100 [02:14<18:06, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.90it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 9 texts:  12%|█▏        | 12/100 [02:26<17:51, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.59it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.02s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.68it/s][A[A

Layer 9 texts:  13%|█▎        | 13/100 [02:38<17:44, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.80it/s][A[A

Layer 9 texts:  14%|█▍        | 14/100 [02:50<17:28, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 9 texts:  15%|█▌        | 15/100 [03:02<17:09, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 9 texts:  16%|█▌        | 16/100 [03:15<16:59, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 9 texts:  17%|█▋        | 17/100 [03:27<16:53, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 9 texts:  18%|█▊        | 18/100 [03:39<16:45, 12.26s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.18it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 9 texts:  19%|█▉        | 19/100 [03:51<16:30, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.24it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 9 texts:  20%|██        | 20/100 [04:04<16:15, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.87it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 9 texts:  21%|██        | 21/100 [04:16<15:57, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 9 texts:  22%|██▏       | 22/100 [04:27<15:40, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.22it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 9 texts:  23%|██▎       | 23/100 [04:39<15:23, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.45it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 9 texts:  24%|██▍       | 24/100 [04:52<15:18, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.49it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 9 texts:  25%|██▌       | 25/100 [05:04<15:06, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.14it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 9 texts:  26%|██▌       | 26/100 [05:16<14:58, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.10s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.56it/s][A[A

Layer 9 texts:  27%|██▋       | 27/100 [05:28<14:47, 12.16s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.34it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.11it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s][A[A

Layer 9 texts:  28%|██▊       | 28/100 [05:40<14:37, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.28it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 9 texts:  29%|██▉       | 29/100 [05:52<14:20, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 9 texts:  30%|███       | 30/100 [06:05<14:13, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.04it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 9 texts:  31%|███       | 31/100 [06:17<14:03, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.25it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 9 texts:  32%|███▏      | 32/100 [06:30<13:58, 12.32s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.64it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 9 texts:  33%|███▎      | 33/100 [06:42<13:40, 12.25s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.04it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 9 texts:  34%|███▍      | 34/100 [06:54<13:22, 12.15s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 9 texts:  35%|███▌      | 35/100 [07:06<13:13, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 9 texts:  36%|███▌      | 36/100 [07:18<13:03, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.85it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 9 texts:  37%|███▋      | 37/100 [07:30<12:47, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 58.71it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 9 texts:  38%|███▊      | 38/100 [07:43<12:37, 12.22s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.48it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 9 texts:  39%|███▉      | 39/100 [07:55<12:19, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.94it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 9 texts:  40%|████      | 40/100 [08:07<12:05, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.69it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 9 texts:  41%|████      | 41/100 [08:18<11:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 9 texts:  42%|████▏     | 42/100 [08:31<11:41, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 9 texts:  43%|████▎     | 43/100 [08:43<11:26, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 9 texts:  44%|████▍     | 44/100 [08:55<11:12, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 9 texts:  45%|████▌     | 45/100 [09:06<10:58, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.50it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 9 texts:  46%|████▌     | 46/100 [09:18<10:44, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.30it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.88it/s][A[A

Layer 9 texts:  47%|████▋     | 47/100 [09:30<10:34, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.28it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 9 texts:  48%|████▊     | 48/100 [09:42<10:24, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.31it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 9 texts:  49%|████▉     | 49/100 [09:55<10:18, 12.14s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.59it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 9 texts:  50%|█████     | 50/100 [10:07<10:03, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 9 texts:  51%|█████     | 51/100 [10:19<09:50, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.90it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 9 texts:  52%|█████▏    | 52/100 [10:30<09:32, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.18it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.64it/s][A[A

Layer 9 texts:  53%|█████▎    | 53/100 [10:42<09:20, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.31it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 9 texts:  54%|█████▍    | 54/100 [10:54<09:09, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 9 texts:  55%|█████▌    | 55/100 [11:06<09:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 9 texts:  56%|█████▌    | 56/100 [11:19<08:51, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 9 texts:  57%|█████▋    | 57/100 [11:31<08:41, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.94it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 9 texts:  58%|█████▊    | 58/100 [11:43<08:24, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.44it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 9 texts:  59%|█████▉    | 59/100 [11:55<08:12, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.07it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.80it/s][A[A

Layer 9 texts:  60%|██████    | 60/100 [12:07<08:00, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.87it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.27it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.09it/s][A[A

Layer 9 texts:  61%|██████    | 61/100 [12:19<07:50, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 9 texts:  62%|██████▏   | 62/100 [12:31<07:39, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.91it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 9 texts:  63%|██████▎   | 63/100 [12:43<07:30, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.55it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 9 texts:  64%|██████▍   | 64/100 [12:56<07:19, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.93it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 9 texts:  65%|██████▌   | 65/100 [13:08<07:06, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.17s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.50it/s][A[A

Layer 9 texts:  66%|██████▌   | 66/100 [13:20<06:54, 12.19s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.59it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.08it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.82it/s][A[A

Layer 9 texts:  67%|██████▋   | 67/100 [13:32<06:41, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 9 texts:  68%|██████▊   | 68/100 [13:44<06:27, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.63it/s][A[A

Layer 9 texts:  69%|██████▉   | 69/100 [13:57<06:18, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.91it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 9 texts:  70%|███████   | 70/100 [14:09<06:07, 12.26s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.33it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 9 texts:  71%|███████   | 71/100 [14:21<05:54, 12.22s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 9 texts:  72%|███████▏  | 72/100 [14:33<05:41, 12.21s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.24it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 9 texts:  73%|███████▎  | 73/100 [14:46<05:30, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.51it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 9 texts:  74%|███████▍  | 74/100 [14:58<05:17, 12.22s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.02it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 9 texts:  75%|███████▌  | 75/100 [15:10<05:04, 12.17s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.83it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:01<00:01,  1.12s/it][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.55it/s][A[A

Layer 9 texts:  76%|███████▌  | 76/100 [15:22<04:53, 12.23s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.53it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 9 texts:  77%|███████▋  | 77/100 [15:34<04:40, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.90it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 9 texts:  78%|███████▊  | 78/100 [15:46<04:26, 12.12s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.61it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 9 texts:  79%|███████▉  | 79/100 [15:59<04:16, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 9 texts:  80%|████████  | 80/100 [16:11<04:04, 12.24s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 9 texts:  81%|████████  | 81/100 [16:23<03:51, 12.20s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 9 texts:  82%|████████▏ | 82/100 [16:35<03:39, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 9 texts:  83%|████████▎ | 83/100 [16:47<03:27, 12.18s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.69it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 9 texts:  84%|████████▍ | 84/100 [16:59<03:14, 12.13s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.86it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 9 texts:  85%|████████▌ | 85/100 [17:11<03:01, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.43it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 9 texts:  86%|████████▌ | 86/100 [17:23<02:48, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 9 texts:  87%|████████▋ | 87/100 [17:35<02:36, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.32it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 9 texts:  88%|████████▊ | 88/100 [17:47<02:24, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 9 texts:  89%|████████▉ | 89/100 [17:59<02:12, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.44it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 9 texts:  90%|█████████ | 90/100 [18:12<02:00, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 9 texts:  91%|█████████ | 91/100 [18:24<01:48, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.33it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 9 texts:  92%|█████████▏| 92/100 [18:35<01:35, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.50it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 9 texts:  93%|█████████▎| 93/100 [18:47<01:23, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.48it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 9 texts:  94%|█████████▍| 94/100 [18:59<01:11, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.71it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 9 texts:  95%|█████████▌| 95/100 [19:11<01:00, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 9 texts:  96%|█████████▌| 96/100 [19:24<00:48, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 9 texts:  97%|█████████▋| 97/100 [19:36<00:36, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.55it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 9 texts:  98%|█████████▊| 98/100 [19:48<00:24, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 9 texts:  99%|█████████▉| 99/100 [20:00<00:12, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 9 texts: 100%|██████████| 100/100 [20:12<00:00, 12.02s/it][A
Processing layers:  40%|████      | 10/25 [3:21:02<5:02:40, 1210.68s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
   Memory after layer 9: 12.83GB

📊 Processing Layer 10
   📥 Loading SAE Layer 10: layer_10/width_16k/canonical



Layer 10 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.40it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 10 texts:   1%|          | 1/100 [00:11<19:40, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.09it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 10 texts:   2%|▏         | 2/100 [00:24<19:38, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.01it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 10 texts:   3%|▎         | 3/100 [00:35<19:16, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.58it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 10 texts:   4%|▍         | 4/100 [00:47<19:00, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.41it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 10 texts:   5%|▌         | 5/100 [00:59<19:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 10 texts:   6%|▌         | 6/100 [01:11<18:45, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.38it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 10 texts:   7%|▋         | 7/100 [01:23<18:38, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.84it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 10 texts:   8%|▊         | 8/100 [01:35<18:23, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 10 texts:   9%|▉         | 9/100 [01:47<18:10, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.88it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 10 texts:  10%|█         | 10/100 [01:59<17:56, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 10 texts:  11%|█         | 11/100 [02:11<17:49, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s][A[A

Layer 10 texts:  12%|█▏        | 12/100 [02:24<17:43, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.02it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 10 texts:  13%|█▎        | 13/100 [02:36<17:30, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.02it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 10 texts:  14%|█▍        | 14/100 [02:48<17:19, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 10 texts:  15%|█▌        | 15/100 [03:00<16:59, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.98it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 10 texts:  16%|█▌        | 16/100 [03:11<16:45, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.92it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 10 texts:  17%|█▋        | 17/100 [03:24<16:38, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.48it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 10 texts:  18%|█▊        | 18/100 [03:36<16:22, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 10 texts:  19%|█▉        | 19/100 [03:47<16:09, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.48it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 10 texts:  20%|██        | 20/100 [03:59<15:54, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.96it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 10 texts:  21%|██        | 21/100 [04:11<15:43, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.71it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 10 texts:  22%|██▏       | 22/100 [04:23<15:35, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.24it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 10 texts:  23%|██▎       | 23/100 [04:35<15:22, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.39it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 10 texts:  24%|██▍       | 24/100 [04:47<15:09, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.04it/s][A[A

Layer 10 texts:  25%|██▌       | 25/100 [04:59<14:54, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 10 texts:  26%|██▌       | 26/100 [05:11<14:41, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 10 texts:  27%|██▋       | 27/100 [05:23<14:30, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.03it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 10 texts:  28%|██▊       | 28/100 [05:35<14:21, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.28it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 10 texts:  29%|██▉       | 29/100 [05:47<14:12, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 10 texts:  30%|███       | 30/100 [05:59<13:57, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.50it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 10 texts:  31%|███       | 31/100 [06:11<13:46, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 10 texts:  32%|███▏      | 32/100 [06:23<13:33, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.06it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 10 texts:  33%|███▎      | 33/100 [06:35<13:19, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 10 texts:  34%|███▍      | 34/100 [06:47<13:09, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.34it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 10 texts:  35%|███▌      | 35/100 [06:59<12:56, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 10 texts:  36%|███▌      | 36/100 [07:11<12:48, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.64it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 10 texts:  37%|███▋      | 37/100 [07:23<12:35, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.01it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 10 texts:  38%|███▊      | 38/100 [07:35<12:22, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 10 texts:  39%|███▉      | 39/100 [07:47<12:08, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.38it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 10 texts:  40%|████      | 40/100 [07:59<11:59, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.04it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 10 texts:  41%|████      | 41/100 [08:11<11:47, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 10 texts:  42%|████▏     | 42/100 [08:23<11:34, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.77it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 10 texts:  43%|████▎     | 43/100 [08:35<11:21, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 10 texts:  44%|████▍     | 44/100 [08:47<11:11, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.15it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 10 texts:  45%|████▌     | 45/100 [08:59<11:01, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 10 texts:  46%|████▌     | 46/100 [09:11<10:49, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.69it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 10 texts:  47%|████▋     | 47/100 [09:23<10:33, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.43it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 10 texts:  48%|████▊     | 48/100 [09:34<10:21, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 10 texts:  49%|████▉     | 49/100 [09:47<10:11, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.69it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 10 texts:  50%|█████     | 50/100 [09:59<10:01, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 10 texts:  51%|█████     | 51/100 [10:11<09:49, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 10 texts:  52%|█████▏    | 52/100 [10:23<09:34, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.45it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 10 texts:  53%|█████▎    | 53/100 [10:35<09:28, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 10 texts:  54%|█████▍    | 54/100 [10:47<09:16, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.27it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 10 texts:  55%|█████▌    | 55/100 [10:59<09:04, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 10 texts:  56%|█████▌    | 56/100 [11:11<08:49, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 10 texts:  57%|█████▋    | 57/100 [11:23<08:39, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.01it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 10 texts:  58%|█████▊    | 58/100 [11:35<08:28, 12.11s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 10 texts:  59%|█████▉    | 59/100 [11:47<08:15, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.90it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 10 texts:  60%|██████    | 60/100 [11:59<08:03, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.23it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 10 texts:  61%|██████    | 61/100 [12:12<07:51, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.87it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 10 texts:  62%|██████▏   | 62/100 [12:24<07:39, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 10 texts:  63%|██████▎   | 63/100 [12:36<07:27, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.52it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 10 texts:  64%|██████▍   | 64/100 [12:48<07:14, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 10 texts:  65%|██████▌   | 65/100 [13:00<07:01, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 10 texts:  66%|██████▌   | 66/100 [13:12<06:47, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 10 texts:  67%|██████▋   | 67/100 [13:24<06:34, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.73it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 10 texts:  68%|██████▊   | 68/100 [13:36<06:24, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 10 texts:  69%|██████▉   | 69/100 [13:48<06:12, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 10 texts:  70%|███████   | 70/100 [14:00<05:59, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.14it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 10 texts:  71%|███████   | 71/100 [14:12<05:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.14it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 10 texts:  72%|███████▏  | 72/100 [14:24<05:36, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.21it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 10 texts:  73%|███████▎  | 73/100 [14:36<05:24, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.33it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 10 texts:  74%|███████▍  | 74/100 [14:48<05:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 10 texts:  75%|███████▌  | 75/100 [14:59<04:58, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.27it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A
Layer 10 texts:  83%|████████▎ | 83/100 [16:36<03:25, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 10 texts:  84%|████████▍ | 84/100 [16:48<03:13, 12.10s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 10 texts:  85%|████████▌ | 85/100 [17:00<03:01, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.47it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 10 texts:  86%|████████▌ | 86/100 [17:12<02:49, 12.09s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.74it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 10 texts:  87%|████████▋ | 87/100 [17:24<02:37, 12.08s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.41it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 10 texts:  88%|████████▊ | 88/100 [17:36<02:24, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 10 texts:  89%|████████▉ | 89/100 [17:48<02:12, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 10 texts:  90%|█████████ | 90/100 [18:00<02:00, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 10 texts:  91%|█████████ | 91/100 [18:12<01:48, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.33it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 10 texts:  92%|█████████▏| 92/100 [18:24<01:35, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 10 texts:  93%|█████████▎| 93/100 [18:36<01:23, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 10 texts:  94%|█████████▍| 94/100 [18:47<01:11, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 10 texts:  95%|█████████▌| 95/100 [18:59<00:59, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.03it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.24it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 10 texts:  96%|█████████▌| 96/100 [19:11<00:47, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.34it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 10 texts:  97%|█████████▋| 97/100 [19:23<00:35, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 10 texts:  98%|█████████▊| 98/100 [19:35<00:23, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.49it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 10 texts:  99%|█████████▉| 99/100 [19:47<00:11, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.80it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 10 texts: 100%|██████████| 100/100 [19:59<00:00, 12.01s/it][A
Processing layers:  44%|████▍     | 11/25 [3:41:03<4:41:47, 1207.70s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
   Memory after layer 10: 12.83GB

📊 Processing Layer 11
   📥 Loading SAE Layer 11: layer_11/width_16k/canonical



Layer 11 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.39it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:   1%|          | 1/100 [00:11<19:46, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.15it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 11 texts:   2%|▏         | 2/100 [00:24<19:36, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.48it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 11 texts:   3%|▎         | 3/100 [00:35<19:17, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.86it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 11 texts:   4%|▍         | 4/100 [00:47<19:06, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:   5%|▌         | 5/100 [00:59<18:51, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.70it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 11 texts:   6%|▌         | 6/100 [01:11<18:37, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 11 texts:   7%|▋         | 7/100 [01:23<18:22, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.12it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 11 texts:   8%|▊         | 8/100 [01:35<18:20, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 11 texts:   9%|▉         | 9/100 [01:47<18:10, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 11 texts:  10%|█         | 10/100 [01:59<17:58, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.05it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 11 texts:  11%|█         | 11/100 [02:11<17:44, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:  12%|█▏        | 12/100 [02:23<17:36, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.69it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 11 texts:  13%|█▎        | 13/100 [02:35<17:24, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.25it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 11 texts:  14%|█▍        | 14/100 [02:47<17:10, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.20it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 11 texts:  15%|█▌        | 15/100 [02:59<16:56, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 11 texts:  16%|█▌        | 16/100 [03:11<16:44, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 11 texts:  17%|█▋        | 17/100 [03:23<16:35, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.63it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 11 texts:  18%|█▊        | 18/100 [03:35<16:23, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 11 texts:  19%|█▉        | 19/100 [03:47<16:15, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.22it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 11 texts:  20%|██        | 20/100 [03:59<16:00, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 11 texts:  21%|██        | 21/100 [04:11<15:50, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.36it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 11 texts:  22%|██▏       | 22/100 [04:23<15:40, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.04it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 11 texts:  23%|██▎       | 23/100 [04:35<15:23, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.99it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 11 texts:  24%|██▍       | 24/100 [04:47<15:12, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 11 texts:  25%|██▌       | 25/100 [04:59<15:04, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 11 texts:  26%|██▌       | 26/100 [05:11<14:48, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 11 texts:  27%|██▋       | 27/100 [05:23<14:34, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.88it/s][A[A

Layer 11 texts:  28%|██▊       | 28/100 [05:35<14:23, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 11 texts:  29%|██▉       | 29/100 [05:47<14:14, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.74it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 11 texts:  30%|███       | 30/100 [05:59<14:02, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.76it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 11 texts:  31%|███       | 31/100 [06:11<13:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 11 texts:  32%|███▏      | 32/100 [06:23<13:33, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.42it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 11 texts:  33%|███▎      | 33/100 [06:35<13:21, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.75it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 11 texts:  34%|███▍      | 34/100 [06:47<13:09, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.47it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:  35%|███▌      | 35/100 [06:59<12:57, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.59it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 11 texts:  36%|███▌      | 36/100 [07:11<12:45, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 11 texts:  37%|███▋      | 37/100 [07:23<12:32, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.18it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 11 texts:  38%|███▊      | 38/100 [07:35<12:20, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.74it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 11 texts:  39%|███▉      | 39/100 [07:47<12:09, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:  40%|████      | 40/100 [07:59<11:59, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.44it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 11 texts:  41%|████      | 41/100 [08:11<11:44, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 11 texts:  42%|████▏     | 42/100 [08:23<11:31, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:  43%|████▎     | 43/100 [08:35<11:22, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.99it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 11 texts:  44%|████▍     | 44/100 [08:47<11:10, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 11 texts:  45%|████▌     | 45/100 [08:59<10:57, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 11 texts:  46%|████▌     | 46/100 [09:11<10:46, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.31it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 11 texts:  47%|████▋     | 47/100 [09:22<10:31, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 11 texts:  48%|████▊     | 48/100 [09:34<10:22, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.57it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 11 texts:  49%|████▉     | 49/100 [09:47<10:12, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.17it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 11 texts:  50%|█████     | 50/100 [09:59<10:02, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 11 texts:  51%|█████     | 51/100 [10:11<09:50, 12.05s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.94it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 11 texts:  52%|█████▏    | 52/100 [10:22<09:33, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.05it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:  53%|█████▎    | 53/100 [10:34<09:20, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.27it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 11 texts:  54%|█████▍    | 54/100 [10:46<09:10, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.49it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:  55%|█████▌    | 55/100 [10:58<08:58, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.50it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 11 texts:  56%|█████▌    | 56/100 [11:10<08:46, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 11 texts:  57%|█████▋    | 57/100 [11:22<08:34, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.23it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 11 texts:  58%|█████▊    | 58/100 [11:34<08:23, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 11 texts:  59%|█████▉    | 59/100 [11:46<08:13, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.58it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 11 texts:  60%|██████    | 60/100 [11:59<08:01, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.63it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 11 texts:  61%|██████    | 61/100 [12:10<07:48, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.43it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:  62%|██████▏   | 62/100 [12:22<07:35, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.05it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 11 texts:  63%|██████▎   | 63/100 [12:35<07:25, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.99it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:  64%|██████▍   | 64/100 [12:46<07:11, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.83it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 11 texts:  65%|██████▌   | 65/100 [12:58<07:00, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.60it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:  66%|██████▌   | 66/100 [13:11<06:49, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.63it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 11 texts:  67%|██████▋   | 67/100 [13:23<06:37, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.03it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 11 texts:  68%|██████▊   | 68/100 [13:35<06:23, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.55it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.10it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s][A[A

Layer 11 texts:  69%|██████▉   | 69/100 [13:47<06:13, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.07it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 11 texts:  70%|███████   | 70/100 [13:59<05:59, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 11 texts:  71%|███████   | 71/100 [14:11<05:47, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 11 texts:  72%|███████▏  | 72/100 [14:22<05:35, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.10it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 11 texts:  73%|███████▎  | 73/100 [14:34<05:23, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.13it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.86it/s][A[A

Layer 11 texts:  74%|███████▍  | 74/100 [14:47<05:13, 12.07s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.17it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 11 texts:  75%|███████▌  | 75/100 [14:59<05:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 11 texts:  76%|███████▌  | 76/100 [15:10<04:47, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.62it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:  77%|███████▋  | 77/100 [15:22<04:34, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 11 texts:  78%|███████▊  | 78/100 [15:34<04:22, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.35it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.89it/s][A[A

Layer 11 texts:  79%|███████▉  | 79/100 [15:46<04:11, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.12it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.88it/s][A[A

Layer 11 texts:  80%|████████  | 80/100 [15:58<03:59, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.62it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:  81%|████████  | 81/100 [16:10<03:46, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.02it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:  82%|████████▏ | 82/100 [16:22<03:34, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.68it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 11 texts:  83%|████████▎ | 83/100 [16:34<03:22, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.30it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 11 texts:  84%|████████▍ | 84/100 [16:46<03:10, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.89it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 11 texts:  85%|████████▌ | 85/100 [16:58<02:59, 11.94s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.22it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 11 texts:  86%|████████▌ | 86/100 [17:10<02:46, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.85it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 11 texts:  87%|████████▋ | 87/100 [17:22<02:35, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.88it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 11 texts:  88%|████████▊ | 88/100 [17:34<02:23, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.00it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 11 texts:  89%|████████▉ | 89/100 [17:45<02:10, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 11 texts:  90%|█████████ | 90/100 [17:58<01:59, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.19it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 11 texts:  91%|█████████ | 91/100 [18:10<01:48, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.93it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s][A[A

Layer 11 texts:  92%|█████████▏| 92/100 [18:22<01:35, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.46it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 11 texts:  93%|█████████▎| 93/100 [18:33<01:23, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.87it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 11 texts:  94%|█████████▍| 94/100 [18:45<01:11, 11.87s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.40it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 11 texts:  95%|█████████▌| 95/100 [18:57<00:59, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.82it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 11 texts:  96%|█████████▌| 96/100 [19:09<00:47, 11.91s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 11 texts:  97%|█████████▋| 97/100 [19:21<00:35, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.56it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 11 texts:  98%|█████████▊| 98/100 [19:33<00:23, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.44it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 11 texts:  99%|█████████▉| 99/100 [19:45<00:11, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.53it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 11 texts: 100%|██████████| 100/100 [19:57<00:00, 11.95s/it][A
Processing layers:  48%|████▊     | 12/25 [4:01:01<4:21:02, 1204.83s/it]

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
   Memory after layer 11: 12.83GB

📊 Processing Layer 12
   📥 Loading SAE Layer 12: layer_12/width_16k/canonical



Layer 12 texts:   0%|          | 0/100 [00:00<?, ?it/s][A

📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.63it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.15it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s][A[A

Layer 12 texts:   1%|          | 1/100 [00:11<19:44, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.47it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 12 texts:   2%|▏         | 2/100 [00:24<19:39, 12.04s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s][A[A

Layer 12 texts:   3%|▎         | 3/100 [00:36<19:26, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 12 texts:   4%|▍         | 4/100 [00:47<19:08, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.54it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 12 texts:   5%|▌         | 5/100 [00:59<18:50, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.08it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 12 texts:   6%|▌         | 6/100 [01:11<18:40, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 12 texts:   7%|▋         | 7/100 [01:23<18:29, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.65it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 12 texts:   8%|▊         | 8/100 [01:35<18:18, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.67it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.01it/s][A[A

Layer 12 texts:   9%|▉         | 9/100 [01:47<18:13, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.53it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 12 texts:  10%|█         | 10/100 [01:59<17:58, 11.98s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.87it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 12 texts:  11%|█         | 11/100 [02:11<17:48, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.11it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 12 texts:  12%|█▏        | 12/100 [02:23<17:35, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.72it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 12 texts:  13%|█▎        | 13/100 [02:35<17:19, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.87it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.14it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s][A[A

Layer 12 texts:  14%|█▍        | 14/100 [02:47<17:10, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.09it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.11it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.87it/s][A[A

Layer 12 texts:  15%|█▌        | 15/100 [02:59<17:00, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.97it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s][A[A

Layer 12 texts:  16%|█▌        | 16/100 [03:11<16:53, 12.06s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.26it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 12 texts:  17%|█▋        | 17/100 [03:23<16:32, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.73it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 12 texts:  18%|█▊        | 18/100 [03:35<16:17, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.64it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 12 texts:  19%|█▉        | 19/100 [03:47<16:06, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 12 texts:  20%|██        | 20/100 [03:59<15:50, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.09it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 12 texts:  21%|██        | 21/100 [04:11<15:38, 11.88s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.78it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 12 texts:  22%|██▏       | 22/100 [04:23<15:29, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.79it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.21it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 12 texts:  23%|██▎       | 23/100 [04:34<15:18, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.77it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.18it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s][A[A

Layer 12 texts:  24%|██▍       | 24/100 [04:46<15:04, 11.90s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.74it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 12 texts:  25%|██▌       | 25/100 [04:58<14:49, 11.86s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.81it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s][A[A

Layer 12 texts:  26%|██▌       | 26/100 [05:10<14:39, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.20it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 12 texts:  27%|██▋       | 27/100 [05:22<14:33, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.13it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.17it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s][A[A

Layer 12 texts:  28%|██▊       | 28/100 [05:34<14:19, 11.93s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.69it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.22it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  2.00it/s][A[A

Layer 12 texts:  29%|██▉       | 29/100 [05:46<14:09, 11.97s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.16it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.16it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s][A[A

Layer 12 texts:  30%|███       | 30/100 [05:58<14:00, 12.01s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 64.62it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 12 texts:  31%|███       | 31/100 [06:10<13:45, 11.96s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 60.96it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 12 texts:  32%|███▏      | 32/100 [06:22<13:37, 12.02s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.37it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 12 texts:  33%|███▎      | 33/100 [06:34<13:23, 11.99s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.96it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.97it/s][A[A

Layer 12 texts:  34%|███▍      | 34/100 [06:46<13:11, 12.00s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 59.95it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.25it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s][A[A

Layer 12 texts:  35%|███▌      | 35/100 [06:58<12:54, 11.92s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 63.39it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.19it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s][A[A

Layer 12 texts:  36%|███▌      | 36/100 [07:10<12:41, 11.89s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.66it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.20it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.98it/s][A[A

Layer 12 texts:  37%|███▋      | 37/100 [07:22<12:32, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.59it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.23it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.03it/s][A[A

Layer 12 texts:  38%|███▊      | 38/100 [07:34<12:21, 11.95s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 62.22it/s]


Activations shape: torch.Size([64, 2304])
📥 Loading model: google/paligemma2-3b-pt-224




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s][A[A

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.07it/s][A[A

Layer 12 texts:  39%|███▉      | 39/100 [07:46<12:14, 12.03s/it][A

⚠️  No logits available for google/paligemma2-3b-pt-224, using zero loss
Activations shape: torch.Size([64, 2304])
📥 Loading model: google/gemma-2-2b




Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 61.59it/s]
