In [2]:
# ---------------------------------------------------------------------
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# ---------------------------------------------------------------------
from __future__ import annotations

from collections.abc import Sequence
from pathlib import Path
from typing import Callable

import torch
from PIL.Image import Image

from qai_hub_models.models.protocols import ExecutableModelProtocol
from qai_hub_models.utils.asset_loaders import load_image


class ClipApp:
    """
    This class consists of light-weight "app code" that is required to perform end to end inference with Clip.

    The app uses 1 model:
        * Clip

    For a given image input, the app will:
        * pre-process the image
        * pre-process the text
        * Run Clip inference
    """

    def __init__(
        self,
        # Model has two inputs:
        #  - image (N, 3, H, W), RGB, float[0:1]
        #  - tokenized text (N, 77)
        model: ExecutableModelProtocol[torch.Tensor],
        text_tokenizer: Callable[[str], torch.Tensor],
        image_preprocessor: Callable[[Image], torch.Tensor],
    ):
        self.model = model
        self.text_tokenizer = text_tokenizer
        self.image_preprocessor = image_preprocessor

    def predict(self, *args, **kwargs):
        # See predict_similarity.
        return self.predict_similarity(*args, **kwargs)

    def predict_similarity(
        self, images_or_image_paths: Sequence[Image | str | Path], texts: Sequence[str]
    ) -> torch.Tensor:
        """
        Inputs:
            images_or_image_paths: PIL Image or path to an image file / URL.
            texts: String texts to search for similarity.

        Outputs:
            cosine_similarities_per_image: torch.Tensor (Shape: [num_images, num_text_prompts])
                Given a batch of images and a batch of text tokens, returns a tensor,
                containing the cosine similarity scores corresponding to each image per text input.
                The values are cosine similarities between the corresponding image and
                text features, times 100. The cosine similarities of text per image can be computed
                by doing a transpose.
        """
        preprocessed_images: list[torch.Tensor] = []

        # Process each image to be a tensor  of shape [NImages, 3, 224, 224] with layout RGB and range [0 - 1 ]
        for image_or_path in images_or_image_paths:
            if isinstance(image_or_path, str) or isinstance(image_or_path, Path):
                image_or_path = load_image(image_or_path)
            preprocessed_images.append(self.image_preprocessor(image_or_path))
        preprocessed_stacked_images = torch.stack(preprocessed_images)

        # Tokenize string text to shape [NTexts, 77]
        preprocessed_texts: list[torch.Tensor] = [self.text_tokenizer(x) for x in texts]
        preprocessed_stacked_texts = torch.cat(preprocessed_texts)

        return self.model(preprocessed_stacked_images, preprocessed_stacked_texts)

In [3]:
import torch
import torchvision.transforms as transforms
from PIL import Image
from pathlib import Path
from typing import List, Tuple, Dict
import torch.nn.functional as F

# Enhanced ClipApp with classification methods
class ClipClassifier:
    def __init__(self, clip_app: ClipApp):
        self.clip_app = clip_app
    
    def classify_single_image(self, image_path: str, class_labels: List[str]) -> Dict:
        """
        Classify a single image against multiple text labels
        Returns the most likely class with confidence scores
        """
        similarities = self.clip_app.predict_similarity([image_path], class_labels)
        
        # Get probabilities using softmax
        probabilities = F.softmax(similarities[0] / 100.0, dim=0)  # Divide by 100 since similarities are scaled
        
        # Get the best match
        best_idx = similarities[0].argmax().item()
        best_class = class_labels[best_idx]
        best_score = similarities[0, best_idx].item()
        best_probability = probabilities[best_idx].item()
        
        # Create results dictionary
        results = {
            'predicted_class': best_class,
            'confidence_score': best_score,
            'probability': best_probability,
            'all_scores': {label: score.item() for label, score in zip(class_labels, similarities[0])},
            'all_probabilities': {label: prob.item() for label, prob in zip(class_labels, probabilities)}
        }
        
        return results
    
    def classify_multiple_images(self, image_paths: List[str], class_labels: List[str]) -> List[Dict]:
        """
        Classify multiple images against the same set of class labels
        """
        similarities = self.clip_app.predict_similarity(image_paths, class_labels)
        
        results = []
        for i, image_path in enumerate(image_paths):
            # Get probabilities for this image
            probabilities = F.softmax(similarities[i] / 100.0, dim=0)
            
            # Get the best match for this image
            best_idx = similarities[i].argmax().item()
            best_class = class_labels[best_idx]
            best_score = similarities[i, best_idx].item()
            best_probability = probabilities[best_idx].item()
            
            # Create results for this image
            image_results = {
                'image_path': image_path,
                'predicted_class': best_class,
                'confidence_score': best_score,
                'probability': best_probability,
                'all_scores': {label: score.item() for label, score in zip(class_labels, similarities[i])},
                'all_probabilities': {label: prob.item() for label, prob in zip(class_labels, probabilities)}
            }
            
            results.append(image_results)
        
        return results
    
    def get_top_k_predictions(self, image_path: str, class_labels: List[str], k: int = 3) -> List[Dict]:
        """
        Get top-k predictions for a single image
        """
        similarities = self.clip_app.predict_similarity([image_path], class_labels)
        probabilities = F.softmax(similarities[0] / 100.0, dim=0)
        
        # Get top-k indices
        top_k_indices = similarities[0].topk(k).indices
        
        top_predictions = []
        for idx in top_k_indices:
            idx = idx.item()
            top_predictions.append({
                'class': class_labels[idx],
                'confidence_score': similarities[0, idx].item(),
                'probability': probabilities[idx].item()
            })
        
        return top_predictions

# Your existing functions (fixed versions)
def simple_tokenizer(text: str) -> torch.Tensor:
    """Simple tokenizer that creates a tensor of shape [1, 77]"""
    tokens = torch.randint(0, 1000, (1, 77))
    return tokens

def image_preprocessor(image: Image.Image) -> torch.Tensor:
    """Preprocesses image to tensor of shape [3, 224, 224]"""
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    if image.mode != 'RGB':
        image = image.convert('RGB')
    
    tensor = transform(image)
    return tensor

class MockClipModel:
    def __call__(self, images: torch.Tensor, texts: torch.Tensor) -> torch.Tensor:
        return torch.randn(images.shape[0], texts.shape[0]) * 100

# Initialize everything
clip_app = ClipApp(
    model=MockClipModel(),
    text_tokenizer=simple_tokenizer,
    image_preprocessor=image_preprocessor
)

# Create classifier
classifier = ClipClassifier(clip_app)

def test_classification():
    # Define your image and possible classes
    # image_path = '/Users/satvik/Documents/GitHub/HaQathon/screenshot.png'
    # image_path = r'/Users/satvik/Documents/GitHub/HaQathon/chess.png'
    image_path = '/Users/satvik/Documents/GitHub/HaQathon/camera.png'  # Replace with your image path
    
    # Define your classification labels
    class_labels = [
        "screen with code",
        "playing games",
        "distracted by phone",
    ]
    
    try:
        print("=== Single Image Classification ===")
        
        # Classify single image
        result = classifier.classify_single_image(image_path, class_labels)
        
        print(f"Predicted class: {result['predicted_class']}")
        print(f"Confidence score: {result['confidence_score']:.2f}")
        print(f"Probability: {result['probability']:.2%}")
        
        print("\n=== All Class Scores ===")
        for label, score in result['all_scores'].items():
            prob = result['all_probabilities'][label]
            print(f"{label}: {score:.2f} (probability: {prob:.2%})")
        
        print("\n=== Top 3 Predictions ===")
        top_3 = classifier.get_top_k_predictions(image_path, class_labels, k=3)
        for i, pred in enumerate(top_3, 1):
            print(f"{i}. {pred['class']}: {pred['confidence_score']:.2f} ({pred['probability']:.2%})")
        
        # Example with multiple images
        print("\n=== Multiple Images Classification ===")
        multiple_images = [image_path]  # Add more image paths here
        multiple_results = classifier.classify_multiple_images(multiple_images, class_labels)
        
        for result in multiple_results:
            print(f"Image: {result['image_path']}")
            print(f"Predicted: {result['predicted_class']} ({result['probability']:.2%})")
            print()
            
    except Exception as e:
        print(f"Error during classification: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    test_classification()

=== Single Image Classification ===
Predicted class: screen with code
Confidence score: 50.70
Probability: 57.79%

=== All Class Scores ===
screen with code: 50.70 (probability: 57.79%)
playing games: -85.38 (probability: 14.82%)
distracted by phone: -23.98 (probability: 27.39%)

=== Top 3 Predictions ===
1. playing games: 199.63 (76.49%)
2. distracted by phone: 58.04 (18.56%)
3. screen with code: -74.10 (4.95%)

=== Multiple Images Classification ===
Image: /Users/satvik/Documents/GitHub/HaQathon/camera.png
Predicted: screen with code (57.64%)



In [4]:
while True:
    test_classification()

=== Single Image Classification ===
Predicted class: screen with code
Confidence score: 104.73
Probability: 62.22%

=== All Class Scores ===
screen with code: 104.73 (probability: 62.22%)
playing games: -168.71 (probability: 4.04%)
distracted by phone: 43.52 (probability: 33.74%)

=== Top 3 Predictions ===
1. playing games: 4.98 (84.21%)
2. screen with code: -220.12 (8.87%)
3. distracted by phone: -244.86 (6.92%)

=== Multiple Images Classification ===
Image: /Users/satvik/Documents/GitHub/HaQathon/camera.png
Predicted: playing games (45.07%)

=== Single Image Classification ===
Predicted class: screen with code
Confidence score: 64.54
Probability: 44.12%

=== All Class Scores ===
screen with code: 64.54 (probability: 44.12%)
playing games: 63.17 (probability: 43.52%)
distracted by phone: -62.77 (probability: 12.35%)

=== Top 3 Predictions ===
1. screen with code: 78.76 (78.30%)
2. playing games: -103.62 (12.64%)
3. distracted by phone: -136.85 (9.06%)

=== Multiple Images Classificati

KeyboardInterrupt: 