In [None]:
!pip install -U bitsandbytes transformers accelerate requests Pillow



In [None]:
import gc
import os
import time

import requests
import torch
from google.colab import userdata
from PIL import Image
from transformers import PaliGemmaForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

# Retrieve Hugging Face token from Colab Secrets and expose it through the
# HF_TOKEN environment variable, which the model-loading code reads later.
try:
    os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')
except Exception:
    # Any failure here (e.g. the secret is missing or not shared with this
    # notebook) leaves HF_TOKEN unset, so nothing below can succeed.
    print("Error: HF_TOKEN not found in Colab Secrets. Please set it in the Secrets panel.")
    # Re-raise instead of calling exit(1): in an IPython kernel `exit` is not
    # sys.exit and does not reliably stop the rest of the cell from running.
    raise

# Helper reporting the CUDA memory currently allocated by PyTorch
def get_memory_usage():
    """Return ``torch.cuda.memory_allocated()`` scaled from bytes to MB (1 MB = 1024**2 bytes)."""
    bytes_per_mb = 1024 ** 2
    allocated_bytes = torch.cuda.memory_allocated()
    return allocated_bytes / bytes_per_mb

# Input prompt for image task
prompt = "Describe the image."

# Load an image from a URL
image_url = "https://picsum.photos/512"  # Replace with your image URL
try:
    # The timeout keeps a stalled download from hanging the notebook forever;
    # the `with` block closes the streamed connection once the image is read.
    with requests.get(image_url, stream=True, timeout=30) as response:
        # Fail on 4xx/5xx instead of handing an error page to PIL.
        response.raise_for_status()
        # Ask urllib3 to undo any gzip/deflate transfer encoding on the raw stream.
        response.raw.decode_content = True
        # convert() forces the pixel data to be decoded while the stream is still open.
        image = Image.open(response.raw).convert("RGB")
except Exception as e:
    print(f"Error loading image: {str(e)}")
    # Re-raise instead of calling exit(1): in an IPython kernel `exit` is not
    # sys.exit and does not reliably stop the rest of the cell, which would
    # then continue without a usable `image`.
    raise

# Initialize variables so the cleanup code can test them even if loading fails
model = None
processor = None

# --- Quantized Setup (4-bit, bitsandbytes) ---
print("=== PaliGemma 3B Quantized (4-bit, bitsandbytes) ===")

# Checkpoint and cache location shared by the model and its processor
model_id = "google/paligemma-3b-pt-224"
model_cache_dir = "/content/model_cache"

# Measure model loading time (covers both the model and the processor)
load_start_time = time.time()

try:
    # Configure 4-bit NF4 quantization with double quantization and fp16 compute
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True
    )

    # Load model and processor with optimizations
    model = PaliGemmaForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=quant_config,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        cache_dir=model_cache_dir,
        token=os.environ["HF_TOKEN"]
    )
    processor = AutoProcessor.from_pretrained(
        model_id,
        cache_dir=model_cache_dir,
        token=os.environ["HF_TOKEN"]
    )

    load_time = time.time() - load_start_time
    print(f"Model Loading Time: {load_time:.2f} seconds")

except Exception as e:
    print(f"Error in model loading: {str(e)}")
    # Re-raise instead of calling exit(1): in an IPython kernel `exit` is not
    # sys.exit and does not reliably stop the cell, so the inference code
    # below would otherwise run against `model = None`.
    raise

# Measure inference time (the timed span covers preprocessing, generation and decoding)
inference_start_time = time.time()

try:
    # Preprocess input (text + image) and move the tensors to the device the
    # model was placed on (it was loaded with device_map="auto").
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    # Generate output with greedy decoding
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=30,
            do_sample=False,
            num_beams=1
        )

    # generate() returns the prompt tokens followed by the continuation; keep
    # only the newly generated part so the prompt is not echoed in the output.
    generated_ids = outputs[0][prompt_len:]
    quant_output = processor.decode(generated_ids, skip_special_tokens=True)
    quant_inference_latency = time.time() - inference_start_time
    quant_memory = get_memory_usage()

    # Calculate throughput (based on generated tokens)
    quant_tokens = len(generated_ids)
    quant_throughput = quant_tokens / quant_inference_latency if quant_inference_latency > 0 else 0.0

    # Print results
    print("Output:", quant_output)
    print(f"Inference Latency: {quant_inference_latency:.2f} seconds")
    print(f"Throughput: {quant_throughput:.2f} tokens/second")
    print(f"Memory Usage: {quant_memory:.2f} MB")

except Exception as e:
    # Best-effort: report the failure and record placeholder metrics so the
    # rest of the notebook can still reference the quant_* variables.
    print(f"Error in inference: {str(e)}")
    quant_output = "Failed"
    quant_inference_latency = quant_throughput = quant_memory = 0.0

# Clear GPU memory
# `model` and `processor` are always bound at this point (initialised to None
# before loading), so `del` cannot fail and needs no try/except.
if model is not None:
    del model
if processor is not None:
    del processor
# Collect any reference cycles still holding tensors before asking PyTorch to
# release its cached CUDA blocks.
gc.collect()
torch.cuda.empty_cache()

=== PaliGemma 3B Quantized (4-bit, bitsandbytes) ===


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.


Model Loading Time: 62.68 seconds
Output: Describe the image.
notebook
Inference Latency: 2.10 seconds
Throughput: 0.95 tokens/second
Memory Usage: 3380.03 MB


In [None]:
!pip install -U bitsandbytes

