In [1]:
from pinecone import Pinecone

In [1]:
import torch
print(f"Current PyTorch version: {torch.__version__}")

# Check if you need to upgrade
if torch.__version__ < "2.6.0":
    print("⚠️ PyTorch version is too old. Please upgrade to 2.6.0 or later")
    print("Run: pip install torch>=2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
else:
    print("✅ PyTorch version is compatible")

Current PyTorch version: 2.7.0+cu118
✅ PyTorch version is compatible


In [1]:
import torch
import time
import numpy as np
from PIL import Image
from transformers import ViTImageProcessor, ViTForImageClassification
import matplotlib.pyplot as plt
import requests
from io import BytesIO
from checkcheck import FastImageCaptioner

captioner = FastImageCaptioner()

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


🚀 Loading Image Captioner on cuda
🎮 GPU: NVIDIA GeForce RTX 4080
📥 Loading Salesforce/blip-image-captioning-base...
🔥 Warming up...
✅ Ready for captioning!


In [2]:
# Single image captioning (lowest latency)

# result = captioner.caption_single_image('test_image.jpg')
# print(result['caption'])  # "a brown dog sitting in a park"

# Batch captioning (highest throughput)  
image_list = ['D:\\keep\\htx\\fast_search\\output_snapshots\\bus\\bus_1_1748446721005.jpg', 'D:\\keep\\htx\\fast_search\\output_snapshots\\car\\car_1_P_track_1_1748446754067.jpg', 'D:\\keep\\htx\\fast_search\\output_snapshots\\person\\person_1_1748446721204.jpg']
results = captioner.caption_batch(image_list, batch_size=16)

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔄 Captioning 3 images in batches of 16
   Batch 1: 59.39ms per image

📝 Captioning Results:
   1. bus_1_1748446721005.jpg: 'a bus is seen in this surveillance image'
   2. car_1_P_track_1_1748446754067.jpg: 'a car is seen in this surveillance image'
   3. person_1_1748446721204.jpg: 'a man is seen in this surveillance image'

📊 Performance Summary:
   Average per image: 59.39ms
   Throughput: 16.8 captions/second


In [3]:
import os
import glob
import json

# Specify your directory here
snapshots_dir = "D:\\keep\\htx\\fast_search\\output_snapshots"  # Change this to your directory path

# Get all image files recursively from all subdirectories
image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tiff', '*.webp']

image_files = []
for root, dirs, files in os.walk(snapshots_dir):
    for ext in image_extensions:
        pattern = os.path.join(root, ext)
        image_files.extend(glob.glob(pattern))
        # Also check uppercase extensions
        pattern_upper = os.path.join(root, ext.upper())
        image_files.extend(glob.glob(pattern_upper))

print(f"📁 Found {len(image_files)} images in '{snapshots_dir}' and subdirectories")
print(f"📂 Subdirectories found: {[d for d in os.listdir(snapshots_dir) if os.path.isdir(os.path.join(snapshots_dir, d))]}")

# Show first few files as preview
if image_files:
    print("\n📋 First few files:")
    for i, file in enumerate(image_files[:5]):
        print(f"   {i+1}. {os.path.basename(file)} ({os.path.dirname(file)})")
    if len(image_files) > 5:
        print(f"   ... and {len(image_files) - 5} more files")

📁 Found 154 images in 'D:\keep\htx\fast_search\output_snapshots' and subdirectories
📂 Subdirectories found: ['boat', 'bus', 'car', 'person', 'screenshots']

📋 First few files:
   1. boat_2_P_track_2_1748446755331.jpg (D:\keep\htx\fast_search\output_snapshots\boat)
   2. boat_2_P_track_2_1748446755331.jpg (D:\keep\htx\fast_search\output_snapshots\boat)
   3. bus_1_1748446721005.jpg (D:\keep\htx\fast_search\output_snapshots\bus)
   4. bus_1_track_23_1748446721434.jpg (D:\keep\htx\fast_search\output_snapshots\bus)
   5. bus_1_track_52_1748446728465.jpg (D:\keep\htx\fast_search\output_snapshots\bus)
   ... and 149 more files


In [4]:
# Alternative: Batch processing for maximum speed
if len(image_files) > 0:
    print("🚀 Starting BATCH captioning process...")
    
    # Process in batches for maximum throughput
    batch_results = captioner.caption_batch(image_files, batch_size=8)
    
    # Format results with filenames
    formatted_results = []
    for result in batch_results['results']:
        formatted_result = {
            'filename': os.path.basename(result['image_path']),
            'full_path': result['image_path'],
            'caption': result['caption'],
            'batch_time_ms': result['batch_time_ms']
        }
        formatted_results.append(formatted_result)
        
        # Print each result
        filename = os.path.basename(result['image_path'])
        print(f"✅ {filename}: '{result['caption'][:60]}...'")
    
    # Statistics
    print(f"\n📊 Batch Captioning Complete!")
    print(f"   Total images: {len(formatted_results)}")
    print(f"   Average time per image: {batch_results['avg_time_per_image_ms']:.2f}ms")
    print(f"   Throughput: {batch_results['throughput_fps']:.1f} images/second")
    
    # Save results
    with open("batch_captions.json", 'w', encoding='utf-8') as f:
        json.dump(formatted_results, f, indent=2, ensure_ascii=False)
    
    print("💾 Results saved to 'batch_captions.json'")

🚀 Starting BATCH captioning process...
🔄 Captioning 154 images in batches of 8


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 1: 62.03ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 2: 25.78ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 3: 24.73ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 4: 23.82ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 5: 24.14ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 6: 26.94ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 7: 28.75ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 8: 25.25ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 9: 23.51ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 10: 26.25ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 11: 26.70ms per image
   Batch 12: 29.22ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 13: 24.26ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 14: 25.45ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 15: 26.61ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 16: 25.40ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 17: 29.26ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 18: 24.82ms per image


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Batch 19: 23.83ms per image
   Batch 20: 71.63ms per image

📝 Captioning Results:
   1. boat_2_P_track_2_1748446755331.jpg: 'surveillance video of a man in a gas station'
   2. boat_2_P_track_2_1748446755331.jpg: 'surveillance video of a man in a gas station'
   3. bus_1_1748446721005.jpg: 'a bus is seen in this surveillance image'
   4. bus_1_track_23_1748446721434.jpg: 'a green bus is seen in this surveillance image'
   5. bus_1_track_52_1748446728465.jpg: 'a bus is driving down the street in the city'
   6. bus_1_1748446721005.jpg: 'a bus is seen in this surveillance image'
   7. bus_1_track_23_1748446721434.jpg: 'a green bus is seen in this surveillance image'
   8. bus_1_track_52_1748446728465.jpg: 'a bus is driving down the street in the city'
   9. car_1_P_track_1_1748446754067.jpg: 'a car is seen in this surveillance image'
   10. car_1_P_track_49_1748446754882.jpg: 'the suspect is seen in this surveillance photo'
   11. car_1_P_track_50_1748446755048.jpg: 'a car is seen in 

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import torch
# Moondream2 implementation
model_id = "vikhyatk/moondream2"
revision = "2024-08-26"

model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    revision=revision,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="cuda:0"
)
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

# Optimized inference
def caption_with_moondream(image_path, question="Describe this image."):
    image = Image.open(image_path)
    enc_image = model.encode_image(image)
    
    return model.answer_question(enc_image, question, tokenizer)

PhiForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [2]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
# Qwen2-VL optimized loading
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    torch_dtype=torch.float16,
    device_map="cuda:0"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

def qwen_inference(image, text_query):
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": text_query},
            ],
        }
    ]
    
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    
    inputs = processor(
        text=[text], 
        images=[image], 
        return_tensors="pt"
    ).to("cuda:0")
    
    with torch.inference_mode():
        output = model.generate(**inputs, max_new_tokens=256)
    
    return processor.decode(output[0], skip_special_tokens=True)

Fetching 5 files: 100%|██████████| 5/5 [02:38<00:00, 31.76s/it] 

Loading checkpoint shards: 100%|██████████| 5/5 [00:22<00:00,  4.54s/it]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save

In [4]:
from PIL import Image

image = Image.open(r"D:\keep\htx\fast_search\output_snapshots\screenshots\screenshot_1_aa7d625f\scene_frame4522_1749009793.jpg")
qwen_inference(image, "what do you see in this image?")

'system\nYou are a helpful assistant.\nuser\nwhat do you see in this image?\nassistant\nThis image shows a bus stop at night. There are several people waiting for the bus, and a few are standing on the sidewalk. The bus stop has a sign with bus route numbers and destinations. The road is visible on the left side of the image, and there are some traffic cones and barriers on the sidewalk. The timestamp on the image indicates it was taken on February 23, 2024, at 14:46.'

In [5]:
qwen_inference(image, "what do you see in this image?")

'system\nYou are a helpful assistant.\nuser\nwhat do you see in this image?\nassistant\nThis image shows a bus stop at night. There are several people waiting for the bus, and a few are standing on the sidewalk. The bus stop has a sign with bus route numbers and destinations. The road is visible on the left side of the image, and there are some traffic cones and barriers on the sidewalk. The timestamp on the image indicates it was taken on February 23, 2024, at 14:46.'

In [None]:
# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("unum-cloud/uform-gen")

In [1]:
from pinecone import Pinecone