In [None]:
# !pip install --upgrade git+https://github.com/openai/CLIP.git


In [None]:
import torch
from torchvision.models import resnet50, ResNet50_Weights
import clip
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


In [None]:
# torchvision ResNet-50 initialised with the IMAGENET1K_V1 pre-trained weights.
rn_torch = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)


In [None]:
# Load CLIP ViT-B/32 onto `device`; `preprocess` is the image transform returned
# alongside the model and is applied to PIL images before they are fed to it.
model, preprocess = clip.load("ViT-B/32", device=device)


# 2.2.1

ImageNet uses the WordNet hierarchy for organizing its 1000 object categories. WordNet is a lexical database that groups words into synonym sets (synsets) and connects them through various relationships (is-a, part-of, etc.). In ImageNet, each synset corresponds to a specific object category, forming a many-to-one mapping (multiple words in a synset can describe the same object). The hierarchy is a directed acyclic graph (DAG) where each node represents a synset and each edge represents a relationship between synsets. The root node is the synset "entity" and the leaves are the object categories. The hierarchy is used to define the label space for the ImageNet dataset, where each image is labeled with the synset of the object it contains.

# 2.2.2

A synset in WordNet is a set of synonyms that represent the same underlying concept or idea. In the context of ImageNet, a synset refers to a specific object category and is identified by a WordNet ID. For example, the synset "n01531178: goldfinch" represents the category of goldfinches.

# 2.2.3

Yes, grouping objects based solely on synsets can lead to challenges in visual recognition for a few reasons:
*  A single word can have multiple meanings (synsets) depending on context. For example, "bat" can refer to the flying mammal (synset for animals) or a baseball bat (synset for sports equipment). ImageNet relies on the surrounding words or image content for disambiguation, which might not always be available.
* Synsets can be broad or narrow, leading to variations in object categories. For example, the "terrier" synset covers many different breeds and is therefore much broader than a single-breed synset such as "n02085620: Chihuahua". This can affect the granularity of object recognition.
* Synsets might group visually distinct images under one overall concept. For instance, the synset "n02099601: golden retriever" might encompass images of golden retrievers in various poses, with different fur colors, or even together with other objects such as frisbees.
* Synsets primarily focus on the object category itself and might not capture specific attributes like size, color, or material. These attributes can be crucial for recognizing specific objects within a synset.

# 2.2.4

1. Pose and Viewpoint: Objects within a category can appear in different poses (standing, sitting, lying down) or from various viewpoints (side view, front view, etc.).
2. Lighting and Background: Lighting conditions and background clutter can significantly alter the appearance of objects within a category.
3. Object Attributes: Objects belonging to the same category might exhibit variations in attributes like size, color, or material (e.g., different colored sneakers within the "sneaker" synset).


In [None]:
import requests
import json
# Download the 1000 ImageNet-1k class names (one per line, in class-index order).
url = 'https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt'
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an HTML error page
# splitlines() + the emptiness filter drop the blank entry that split('\n')
# leaves behind when the file ends with a newline; that blank entry would
# otherwise become a bogus 1001st class and an empty "a " prompt.
class_names = [line.strip() for line in response.text.splitlines() if line.strip()]
assert len(class_names) == 1000, f"expected 1000 ImageNet classes, got {len(class_names)}"

In [None]:
import torchvision.transforms as transforms
from PIL import Image
import torch.nn.functional as F


# Evaluation-time preprocessing applied to images before the torchvision ResNet-50:
# resize, centre-crop to 224x224, convert to a tensor, then normalise each channel.
_resnet_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(_resnet_steps)

In [None]:
# Sample image used for the first zero-shot prediction.
img_path = '/content/ILSVRC2012_val_00042079.JPEG'
img = Image.open(img_path)
# CLIP preprocessing, plus a leading batch dimension, moved to the model's device.
image = preprocess(img).unsqueeze(0).to(device)
# One "a <class>" prompt per class name, tokenized together as a single batch.
prompts = [f"a {c}" for c in class_names]
text_t = clip.tokenize(prompts).to(device)

In [None]:
# Zero-shot classification: score the image against one "a <class>" prompt per class.
text_inputs = [f"a {class_name}" for class_name in class_names]
text_input = clip.tokenize(text_inputs).to(device)

with torch.no_grad():
    logits_per_image, logits_per_text = model(image, text_input)
    # Softmax is taken in FP32 so the probabilities are not computed in half precision.
    probs_tensor = logits_per_image.float().softmax(dim=-1).cpu()
probs = probs_tensor.numpy()

# Take the top-5 of the softmax output (not of the raw logits) so the values
# printed under "Probability" really are probabilities in [0, 1].
top5_probs, top5_classes = torch.topk(probs_tensor, 5)


for i in range(5):
    print(f"Class: {class_names[top5_classes[0][i]]}, Probability: {top5_probs[0][i].item()}")



The model works as expected and predicts the correct object category.

In [None]:
from matplotlib import pyplot as plt
from torch.profiler import profile, record_function, ProfilerActivity
def display_image_and_categories(img_path,model = model):
    """Show an image and print the top-5 predictions of CLIP and of ResNet-50.

    Parameters
    ----------
    img_path : str
        Path of the image file to classify.
    model :
        CLIP model called as ``model(image, text_tokens)``. The default is the
        ``model`` object that exists when this function is defined; rebinding
        the global name ``model`` later does not change the default.

    Returns
    -------
    torch.profiler.profile
        Profiler that recorded the CLIP forward pass (CPU activity always,
        CUDA activity as well when CUDA is available).
    """
    # Load eagerly and force three channels: `transform` normalises with
    # 3-channel statistics, so grayscale or RGBA files would otherwise fail.
    with Image.open(img_path) as img_file:
        img = img_file.convert("RGB")
    image = preprocess(img).unsqueeze(0).to(device)
    # Feed ResNet on whatever device its weights currently live on.
    img_t = transform(img).unsqueeze(0).to(next(rn_torch.parameters()).device)

    # Build the text prompts outside the profiled region so the profile only
    # contains the model's forward pass.
    text_inputs = [f"a {class_name}" for class_name in class_names]
    text_input = clip.tokenize(text_inputs).to(device)

    # Record CUDA activity too when a GPU is present; otherwise the CUDA
    # columns of the profiler table have nothing to report.
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)

    # Perform the inference
    with torch.no_grad():
        with torch.autocast("cuda"):
            with profile(activities=activities, profile_memory=True, record_shapes=True) as prof:
                logits_per_image, logits_per_text = model(image, text_input)
    # Softmax in FP32, expressed as percentages.
    probs = logits_per_image.float().softmax(dim=-1).cpu() * 100

    # Get the top 5 predicted classes
    top5_probs, top5_classes = torch.topk(probs, 5, dim=1)

    # Display the image
    plt.imshow(img)
    plt.axis("off")
    plt.show()

    # Print the top 5 predicted classes and their probabilities
    print("Top 5 predicted categories with CLIP:")
    for prob, class_idx in zip(top5_probs[0].tolist(), top5_classes[0].tolist()):
        print(f"{class_names[class_idx]}: {prob:.2f}%")

    # Same image through the torchvision ResNet-50 (inference mode, no gradients).
    rn_torch.eval()
    with torch.no_grad():
        logits_torch = rn_torch(img_t)

    probs_torch = logits_torch.float().softmax(dim=1).cpu()
    top5_torch = torch.topk(probs_torch, 5)

    print('\nTop-5 ImageNet classes for ResNet-50 from torchvision:')
    for prob, class_idx in zip(top5_torch.values[0].tolist(), top5_torch.indices[0].tolist()):
        print(f'{class_names[class_idx]}: {prob:.2%}')

    return prof


In [None]:
prof=display_image_and_categories(img_path='4.jpg')

In [None]:
display_image_and_categories('3.jpg')

In [None]:
import torch
import time
from torchvision import models
import clip

# Load two independent copies of the pre-trained CLIP ViT-B/32 model so that
# one can stay in FP32 while the other runs in FP16.
model, preprocess = clip.load("ViT-B/32", device="cuda")
# Force full precision for the reference model, whatever dtype it was loaded in.
model = model.float().eval()

model_half, preprocess = clip.load("ViT-B/32", device="cuda")
# Convert the second copy to FP16 with CLIP's own helper. nn.Module.half() works
# in place and returns the same module, so calling it on `model.visual` would
# both convert the FP32 model and make the two models share one image encoder.
clip.model.convert_weights(model_half)
model_half = model_half.eval()

# Sample image for timing (replace with your actual image)
with Image.open("4.jpg") as sample_file:
    image = preprocess(sample_file).unsqueeze(0).cuda()


In [None]:

# Function to measure inference time
def measure_inference_time(model, image):
    """Return the time in seconds taken by one ``model.encode_image(image)`` call.

    CUDA kernels are launched asynchronously, so when CUDA is available the
    device is synchronised before starting and before stopping the clock;
    otherwise only the kernel launch would be measured.

    Parameters
    ----------
    model : object exposing ``encode_image(image)``.
    image : input passed unchanged to ``encode_image``.

    Returns
    -------
    float
        Elapsed time in seconds, measured with a monotonic high-resolution clock.
    """
    use_cuda = torch.cuda.is_available()
    with torch.no_grad():
        if use_cuda:
            torch.cuda.synchronize()  # drain previously queued GPU work
        start_time = time.perf_counter()
        _ = model.encode_image(image)
        if use_cuda:
            torch.cuda.synchronize()  # wait for the forward pass to actually finish
        end_time = time.perf_counter()
    return end_time - start_time


In [None]:

# Time the FP32 model (repeat for ~100 times)
# .float() converts in place and returns the same module, so do it once
# rather than on every iteration.
fp32_model = model.float()
# Untimed warm-up run so one-off start-up costs do not skew the mean and std.
measure_inference_time(fp32_model, image.clone())
fp32_times = [measure_inference_time(fp32_model, image.clone()) for _ in range(100)]


In [None]:

# Time the FP16 model (repeat for ~100 times)
# Time `model_half`, not `model`: `model` has just been used as the FP32
# reference, so timing it again would only repeat the FP32 measurement.
# CLIP's helper casts the applicable weights to FP16 (a no-op for weights that
# already are FP16), which guarantees this run really is half precision.
clip.model.convert_weights(model_half)
# Untimed warm-up run so one-off start-up costs do not skew the mean and std.
measure_inference_time(model_half, image.clone())
fp16_times = [measure_inference_time(model_half, image.clone()) for _ in range(100)]


In [None]:

# Summarise both timing runs: mean and standard deviation of the per-call times.
fp32_times_t = torch.tensor(fp32_times)
fp16_times_t = torch.tensor(fp16_times)

print(f"FP32 Mean Inference Time: {fp32_times_t.mean()}")
print(f"FP32 Inference Time Std Dev: {fp32_times_t.std()}")

print(f"FP16 Mean Inference Time: {fp16_times_t.mean()}")
print(f"FP16 Inference Time Std Dev: {fp16_times_t.std()}")


In [None]:
# Speedup = mean FP32 time divided by mean FP16 time (values above 1 mean FP16 is faster).
mean_fp32_time = torch.tensor(fp32_times).mean()
mean_fp16_time = torch.tensor(fp16_times).mean()
print(f"Mean Inference Time Speedup: {mean_fp32_time / mean_fp16_time}")

In [None]:
# Function to get and compare probabilities
def compare_probabilities(model, images):
    """Print, per image, the largest gap between FP32 and FP16 softmax outputs.

    The model is switched to FP32 for the first pass and to FP16 for the second,
    so the comparison does not depend on the dtype the model arrives in. Both
    outputs are cast to FP32 before the softmax, so the reported difference
    comes from the encoder's precision only. The model is always left in FP32
    afterwards, even if one of the passes raises.

    Parameters
    ----------
    model : object exposing ``encode_image``, ``half()`` and ``float()``.
    images : list of tensors
        Image batches that are concatenated along the first dimension.

    Returns
    -------
    list of float
        Maximum absolute probability difference for each image, in input order.
    """
    batch = torch.cat(images)
    try:
        with torch.no_grad():
            # Calculate FP32 probabilities
            model.float()
            fp32_logits = model.encode_image(batch.float())
            fp32_probabilities = fp32_logits.float().softmax(dim=-1)

            # Calculate FP16 probabilities
            model.half()  # Convert to FP16
            fp16_logits = model.encode_image(batch.half())
            fp16_probabilities = fp16_logits.float().softmax(dim=-1)
    finally:
        model.float()  # Convert back to FP32, also on failure

    # Print differences
    max_diffs = []
    for i in range(batch.shape[0]):
        diff = torch.abs(fp32_probabilities[i] - fp16_probabilities[i])
        print(f"Max Probability Difference for Image {i}: {torch.max(diff)}")
        max_diffs.append(torch.max(diff).item())
    return max_diffs


In [None]:
# Preprocess Images/3.jpg .. Images/5.jpg into single-image batches on the GPU.
# A dedicated loop variable is used so the global `image` (the timing sample)
# is not overwritten, and each file is closed once it has been preprocessed.
images = []
path = 'Images/'
for i in range(3, 6):
    with Image.open(f"{path}{i}.jpg") as image_file:
        images.append(preprocess(image_file).unsqueeze(0).cuda())


In [None]:

# Same image twice: first with the function's default CLIP model, then with model_half.
display_image_and_categories('Images/1.jpg')
display_image_and_categories('Images/1.jpg',model_half)

In [None]:

# Same image twice: first with the function's default CLIP model, then with model_half.
display_image_and_categories('Images/4.jpg')
display_image_and_categories('Images/4.jpg',model_half)

In [None]:

# Same image twice: first with the function's default CLIP model, then with model_half.
display_image_and_categories('Images/9.jpg')
display_image_and_categories('Images/9.jpg',model_half)

There is a significant difference in the probability scores for the FP32 and FP16 models. The FP16 model has lower probability scores for the correct object category compared to the FP32 model. This difference in scores can be attributed to the reduced precision of the FP16 model, which might affect the model's ability to capture fine-grained details and make accurate predictions. However, since our prediction is almost always within the top-5 predictions, the model is still able to recognize the object category correctly despite the differences in probability scores.

In [None]:
# Keep the profiler returned by each run: prof1 for the default model, prof2 for model_half.
prof1 = display_image_and_categories('Images/9.jpg')
prof2= display_image_and_categories('Images/9.jpg',model_half)

In [None]:
# Top 10 operators of the default-model run, ranked by self CUDA memory usage.
print(prof1.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))


In [None]:
# Top 10 operators of the model_half run, ranked by self CUDA memory usage.
print(prof2.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))


In [None]:
# Write the default-model profile to prof1.json in Chrome trace format.
prof1.export_chrome_trace("prof1.json")