In [None]:
!pip install -q bitsandbytes==0.43.0 transformers accelerate pillow

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m95.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import time
import torch
import requests
from PIL import Image
from transformers import AutoModel, AutoProcessor, BitsAndBytesConfig
try:
    from google.colab import userdata
    is_colab = True
except ImportError:
    is_colab = False
import os
import gc

# Report CUDA visibility once; either describe device 0 or tell the user how
# to switch the Colab runtime to a GPU.
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if not cuda_ready:
    for hint in (
        "⚠️ No GPU detected! Please enable GPU in Colab.",
        "Go to: Runtime > Change runtime type > Hardware accelerator > GPU",
        "Then restart the runtime: Runtime > Restart runtime",
    ):
        print(hint)
else:
    print(f"GPU Model: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9 below, i.e. reported in decimal gigabytes.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")

# Confirm that bitsandbytes can be imported before attempting the 4-bit run;
# on failure, print the reinstall command instead of stopping the script.
try:
    import bitsandbytes as bnb
except ImportError:
    print("❌ bitsandbytes not installed correctly")
    print("Try installing with: !pip install bitsandbytes==0.43.0")
else:
    print(f"✅ bitsandbytes version: {bnb.__version__}")

# Retrieve Hugging Face token (optional)
# On Colab, copy the HF_TOKEN secret into the process environment. This is
# best-effort: the script continues without a token if the lookup fails.
if is_colab:
    hf_token = None
    try:
        hf_token = userdata.get('HF_TOKEN')
    except Exception as exc:
        # Presumably raised when the secret is missing or notebook access is
        # denied (TODO confirm against the Colab userdata API). Catching
        # Exception instead of a bare `except:` lets KeyboardInterrupt and
        # SystemExit propagate, and the reason is shown rather than hidden.
        print(f"Could not read HF_TOKEN ({type(exc).__name__}: {exc})")

    if hf_token:
        os.environ["HF_TOKEN"] = hf_token
        print("HF_TOKEN found and set")
    else:
        # Covers a failed lookup as well as a None/empty secret, which must
        # not be written to os.environ (it only accepts non-None strings).
        print("HF_TOKEN not found, proceeding without it.")

# Function to measure memory usage
def get_memory_usage():
    """Return the CUDA memory currently allocated by PyTorch, in MiB.

    Returns 0 when no CUDA device is available.
    """
    if not torch.cuda.is_available():
        return 0
    allocated_bytes = torch.cuda.memory_allocated()
    return allocated_bytes / 1024**2  # MB

# Function to clear memory
def clear_memory():
    """Run the garbage collector, then empty PyTorch's CUDA cache if CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

# Input prompt (text-only)
# The same prompt is fed to both the FP16 and the 4-bit run further down.
prompt = "What is the meaning of life?"

# Optional: Image input (uncomment to use)
# The snippet is wrapped in a bare triple-quoted string, so Python evaluates it
# as an unused string expression and none of it runs.
# NOTE(review): enabling it only defines `image` and overrides `prompt`; the
# processor calls below still pass images=None, so they would need updating
# too. Also, exit(1) would terminate the interpreter from inside a notebook.
"""
image_url = "https://picsum.photos/512"
try:
    response = requests.get(image_url, stream=True)
    image = Image.open(response.raw).convert("RGB")
except Exception as e:
    print(f"Error loading image: {str(e)}")
    exit(1)
prompt = "Describe the image."
"""

# Create cache directory if it doesn't exist
# Every from_pretrained call below passes this path as cache_dir.
os.makedirs("/content/model_cache", exist_ok=True)

# --- FP16 Setup ---
# Baseline run: load MiniCPM-V 2.6 in half precision, generate up to 30 tokens
# greedily for `prompt`, and record load time, inference latency, throughput
# and the CUDA memory reported by get_memory_usage(). Any failure is printed
# and the metrics fall back to "Failed" / 0.0 so the final comparison is skipped.
print("\n=== MiniCPM-V 2.6 FP16 ===")
fp16_load_start = time.time()

try:
    fp16_model = AutoModel.from_pretrained(
        "openbmb/MiniCPM-V-2_6",
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        cache_dir="/content/model_cache",
        trust_remote_code=True
    )
    fp16_processor = AutoProcessor.from_pretrained(
        "openbmb/MiniCPM-V-2_6",
        cache_dir="/content/model_cache",
        trust_remote_code=True
    )

    fp16_load_time = time.time() - fp16_load_start
    print(f"FP16 Loading Time: {fp16_load_time:.2f} seconds")

    # Inference
    # The timed window covers tokenisation, generation and decoding.
    # NOTE(review): the model card drives this model through model.chat(...);
    # confirm that the processor + generate() path works for text-only input.
    fp16_inf_start = time.time()
    # Follow the device chosen by device_map="auto" instead of assuming "cuda".
    inputs = fp16_processor(text=prompt, images=None, return_tensors="pt").to(fp16_model.device)
    with torch.no_grad():
        outputs = fp16_model.generate(
            **inputs,
            max_new_tokens=30,
            do_sample=False,
            num_beams=1
        )
    fp16_output = fp16_processor.decode(outputs[0], skip_special_tokens=True)
    fp16_inf_latency = time.time() - fp16_inf_start
    # Sampled while the model is still loaded, i.e. weights plus live tensors.
    fp16_memory = get_memory_usage()
    # Assumes generate() returns prompt + new tokens — TODO confirm for this model.
    fp16_tokens = len(outputs[0]) - inputs["input_ids"].shape[1]
    fp16_throughput = fp16_tokens / fp16_inf_latency if fp16_inf_latency > 0 else 0.0

    print("Output:", fp16_output)
    print(f"Inference Latency: {fp16_inf_latency:.2f} seconds")
    print(f"Throughput: {fp16_throughput:.2f} tokens/second")
    print(f"Memory Usage: {fp16_memory:.2f} MB")

except Exception as e:
    print(f"Error in FP16 setup: {str(e)}")
    fp16_output = "Failed"
    fp16_load_time = fp16_inf_latency = fp16_throughput = fp16_memory = 0.0

# Clear GPU memory
# Some of these names do not exist if loading or inference failed part-way;
# popping them from the module namespace tolerates that without a bare
# `except:`. `inputs` and `outputs` are dropped too so that the tensors they
# reference can be released before the next measurement.
for _stale_name in ("fp16_model", "fp16_processor", "inputs", "outputs"):
    globals().pop(_stale_name, None)
clear_memory()

# --- Quantized Setup (4-bit, bitsandbytes) ---
# Same measurement as the FP16 run, but the weights are loaded through a
# 4-bit NF4 bitsandbytes configuration with float16 compute and double
# quantisation. Any failure is printed and the metrics fall back to
# "Failed" / 0.0 so the final comparison is skipped.
print("\n=== MiniCPM-V 2.6 Quantized (4-bit, bitsandbytes) ===")
quant_load_start = time.time()

try:
    # Make sure bitsandbytes is imported
    # (fails fast here, inside the try, if the package is unusable)
    import bitsandbytes as bnb

    # Define quantization configuration
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True
    )

    # Load quantized model
    quant_model = AutoModel.from_pretrained(
        "openbmb/MiniCPM-V-2_6",
        quantization_config=quant_config,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        cache_dir="/content/model_cache",
        trust_remote_code=True
    )

    # Load processor
    quant_processor = AutoProcessor.from_pretrained(
        "openbmb/MiniCPM-V-2_6",
        cache_dir="/content/model_cache",
        trust_remote_code=True
    )

    quant_load_time = time.time() - quant_load_start
    print(f"Quantized Loading Time: {quant_load_time:.2f} seconds")

    # Inference
    # The timed window covers tokenisation, generation and decoding.
    # NOTE(review): the model card drives this model through model.chat(...);
    # confirm that the processor + generate() path works for text-only input.
    quant_inf_start = time.time()
    # Follow the device chosen by device_map="auto" instead of assuming "cuda".
    inputs = quant_processor(text=prompt, images=None, return_tensors="pt").to(quant_model.device)
    with torch.no_grad():
        outputs = quant_model.generate(
            **inputs,
            max_new_tokens=30,
            do_sample=False,
            num_beams=1
        )
    quant_output = quant_processor.decode(outputs[0], skip_special_tokens=True)
    quant_inf_latency = time.time() - quant_inf_start
    # Sampled while the model is still loaded, i.e. weights plus live tensors.
    quant_memory = get_memory_usage()
    # Assumes generate() returns prompt + new tokens — TODO confirm for this model.
    quant_tokens = len(outputs[0]) - inputs["input_ids"].shape[1]
    quant_throughput = quant_tokens / quant_inf_latency if quant_inf_latency > 0 else 0.0

    print("Output:", quant_output)
    print(f"Inference Latency: {quant_inf_latency:.2f} seconds")
    print(f"Throughput: {quant_throughput:.2f} tokens/second")
    print(f"Memory Usage: {quant_memory:.2f} MB")

except Exception as e:
    print(f"Error in quantized setup: {str(e)}")
    quant_output = "Failed"
    quant_load_time = quant_inf_latency = quant_throughput = quant_memory = 0.0

# Clear GPU memory
# Some of these names do not exist if loading or inference failed part-way;
# popping them from the module namespace tolerates that without a bare
# `except:`. `inputs` and `outputs` are dropped too so that the tensors they
# reference can be released.
for _stale_name in ("quant_model", "quant_processor", "inputs", "outputs"):
    globals().pop(_stale_name, None)
clear_memory()

# Print comparison (if both versions ran successfully)
# Each run stores the sentinel string "Failed" as its output when it errors,
# so the table is only printed when neither output equals that sentinel.
if fp16_output != "Failed" and quant_output != "Failed":
    print("\n=== Performance Comparison ===")
    comparison_rows = [
        ("", "FP16", "4-bit"),
        ("Loading Time", f"{fp16_load_time:.2f}s", f"{quant_load_time:.2f}s"),
        ("Inference Time", f"{fp16_inf_latency:.2f}s", f"{quant_inf_latency:.2f}s"),
        ("Throughput", f"{fp16_throughput:.2f} t/s", f"{quant_throughput:.2f} t/s"),
        ("Memory Usage", f"{fp16_memory:.2f} MB", f"{quant_memory:.2f} MB"),
    ]
    # Size each column from its widest cell so the separators stay aligned
    # whatever the magnitude of the measured values.
    column_widths = [
        max(len(row[column]) for row in comparison_rows) for column in range(3)
    ]
    for row_index, (label, fp16_cell, quant_cell) in enumerate(comparison_rows):
        print(
            f"{label:<{column_widths[0]}} | "
            f"{fp16_cell:<{column_widths[1]}} | {quant_cell}"
        )
        if row_index == 0:
            # Rule under the header; "-|-" lines up with the " | " separators.
            print("-|-".join("-" * width for width in column_widths))

    # Calculate memory savings
    # Relative reduction of the 4-bit figure against the FP16 figure.
    if fp16_memory > 0:
        memory_savings = (1 - (quant_memory / fp16_memory)) * 100
        print(f"Memory Savings: {memory_savings:.1f}%")

    # Calculate speed impact
    # Relative throughput change; positive means the 4-bit run was faster.
    if fp16_throughput > 0:
        speed_impact = ((quant_throughput / fp16_throughput) - 1) * 100
        if speed_impact > 0:
            speed_label = "faster"
        elif speed_impact < 0:
            speed_label = "slower"
        else:
            # An exact tie is neither faster nor slower.
            speed_label = "unchanged"
        print(f"Speed Impact: {speed_impact:.1f}% ({speed_label})")