In [5]:
import torch
import clip
from PIL import Image
import numpy as np
import json

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, transform = clip.load("RN50", device=device)

# Load ImageNet labels from JSON
with open("/content/drive/MyDrive/imagenet_labels.json", "r") as f:
    classes = json.load(f)

def preprocess_image(image_path):
    image = Image.open(image_path)
    image_input = transform(image).unsqueeze(0).to(device)
    return image_input

def predict_image(image_path, classes, top_k=5):
    image_input = preprocess_image(image_path)
    with torch.no_grad():
        text_features = clip.tokenize(classes).to(device)
        logits_per_image, logits_per_text = model(image_input, text_features)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()

        # Display top categories
        top_k_idx = np.argsort(probs.squeeze())[::-1][:top_k]
        for idx in top_k_idx:
            print(f"{classes[idx]}: {probs.squeeze()[idx]:.5f}")

# Example images
example_images = ["/content/drive/MyDrive/Images/banana1.jpg",
    "/content/drive/MyDrive/Images/banana2.jpg",
    "/content/drive/MyDrive/Images/bicycle1.jpg",
    "/content/drive/MyDrive/Images/bicycle2.jpeg",
    "/content/drive/MyDrive/Images/bird1.jpg",
    "/content/drive/MyDrive/Images/bird2.jpg",
    "/content/drive/MyDrive/Images/car1.jpg",
    "/content/drive/MyDrive/Images/car2.jpg",
    "/content/drive/MyDrive/Images/coffeemug1.jpg",
    "/content/drive/MyDrive/Images/coffeemug2.jpg",
    "/content/drive/MyDrive/Images/dog1.jpg",
    "/content/drive/MyDrive/Images/dog2.jpg",
    "/content/drive/MyDrive/Images/fish1.png",
    "/content/drive/MyDrive/Images/fish2.jpg",
    "/content/drive/MyDrive/Images/flower1.jpg",
    "/content/drive/MyDrive/Images/flower2.jpg",
    "/content/drive/MyDrive/Images/guitar1.png",
    "/content/drive/MyDrive/Images/guitar2.png",
    "/content/drive/MyDrive/Images/laptop1.jpg",
    "/content/drive/MyDrive/Images/laptop2.jpg",]

# Test with example images
for image_path in example_images:
    print(f"Predictions for {image_path}:")
    predict_image(image_path, classes)
    print()


Predictions for /content/drive/MyDrive/Images/banana1.jpg:
banana: 0.77447
lemon: 0.03464
slug: 0.03345
butternut squash: 0.01043
tusker: 0.00899

Predictions for /content/drive/MyDrive/Images/banana2.jpg:
banana: 0.96961
lemon: 0.00512
acorn squash: 0.00333
slug: 0.00144
bib: 0.00140

Predictions for /content/drive/MyDrive/Images/bicycle1.jpg:
tandem bicycle: 0.57439
mountain bike: 0.15140
disc brake: 0.05778
unicycle: 0.04666
tricycle: 0.03666

Predictions for /content/drive/MyDrive/Images/bicycle2.jpeg:
mountain bike: 0.75485
tandem bicycle: 0.12317
unicycle: 0.06895
tricycle: 0.01487
mongoose: 0.00670

Predictions for /content/drive/MyDrive/Images/bird1.jpg:
macaw: 0.16383
hen: 0.11998
jacamar: 0.06347
duck: 0.05227
bulbul: 0.04126

Predictions for /content/drive/MyDrive/Images/bird2.jpg:
hen: 0.13121
duck: 0.07141
common gallinule: 0.04753
bulbul: 0.04612
drumstick: 0.04366

Predictions for /content/drive/MyDrive/Images/car1.jpg:
minibus: 0.28072
tow truck: 0.06397
taxicab: 0.0609

In [6]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import json
import numpy as np

# Load ImageNet pretrained ResNet-50 model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', pretrained=True)
model.to(device)
model.eval()

# Load ImageNet labels from JSON
with open("/content/drive/MyDrive/archive/imagenet_labels.json", "r") as f:
    classes = json.load(f)

def preprocess_image(image_path):
    image = Image.open(image_path).convert("RGB")
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    image_input = transform(image).unsqueeze(0).to(device)
    return image_input

def predict_image(image_path, classes, top_k=5):
    image_input = preprocess_image(image_path)
    with torch.no_grad():
        outputs = model(image_input)
        probs = torch.nn.functional.softmax(outputs, dim=-1)[0]

        # Display top categories
        top_k_probs, top_k_indices = torch.topk(probs, top_k)
        for prob, idx in zip(top_k_probs, top_k_indices):
            print(f"{classes[idx]}: {prob.item():.5f}")

# Example images
example_images = ["/content/drive/MyDrive/Images/banana1.jpg",
    "/content/drive/MyDrive/Images/banana2.jpg",
    "/content/drive/MyDrive/Images/bicycle1.jpg",
    "/content/drive/MyDrive/Images/bicycle2.jpeg",
    "/content/drive/MyDrive/Images/bird1.jpg",
    "/content/drive/MyDrive/Images/bird2.jpg",
    "/content/drive/MyDrive/Images/car1.jpg",
    "/content/drive/MyDrive/Images/car2.jpg",
    "/content/drive/MyDrive/Images/coffeemug1.jpg",
    "/content/drive/MyDrive/Images/coffeemug2.jpg",
    "/content/drive/MyDrive/Images/dog1.jpg",
    "/content/drive/MyDrive/Images/dog2.jpg",
    "/content/drive/MyDrive/Images/fish1.png",
    "/content/drive/MyDrive/Images/fish2.jpg",
    "/content/drive/MyDrive/Images/flower1.jpg",
    "/content/drive/MyDrive/Images/flower2.jpg",
    "/content/drive/MyDrive/Images/guitar1.png",
    "/content/drive/MyDrive/Images/guitar2.png",
    "/content/drive/MyDrive/Images/laptop1.jpg",
    "/content/drive/MyDrive/Images/laptop2.jpg",]

# Test with example images
for image_path in example_images:
    print(f"Predictions for {image_path}:")
    predict_image(image_path, classes)
    print()


Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to /root/.cache/torch/hub/v0.10.0.zip


Predictions for /content/drive/MyDrive/Images/banana1.jpg:
nipple: 0.32817
banana: 0.07244
hook: 0.07063
toilet seat: 0.04974
rocking chair: 0.04221

Predictions for /content/drive/MyDrive/Images/banana2.jpg:
flatworm: 0.49722
banana: 0.31156
sea slug: 0.01801
pitcher: 0.01108
pan flute: 0.00937

Predictions for /content/drive/MyDrive/Images/bicycle1.jpg:
mountain bike: 0.66677
tandem bicycle: 0.13627
tricycle: 0.05243
disc brake: 0.03757
turnstile: 0.01667

Predictions for /content/drive/MyDrive/Images/bicycle2.jpeg:
tricycle: 0.87903
mountain bike: 0.04611
tandem bicycle: 0.02420
moped: 0.02302
unicycle: 0.00658

Predictions for /content/drive/MyDrive/Images/bird1.jpg:
crash helmet: 0.06224
pinwheel: 0.05892
pencil sharpener: 0.04948
barrette: 0.04848
nipple: 0.02645

Predictions for /content/drive/MyDrive/Images/bird2.jpg:
laptop computer: 0.22874
envelope: 0.15189
website: 0.12267
comic book: 0.11740
jigsaw puzzle: 0.03795

Predictions for /content/drive/MyDrive/Images/car1.jpg:
go

| Image | CLIP | Imagenet Pretained RN50 |
|-----------------|-----------------|-----------------|
| Banana1    | banana: 0.77447    | banana: 0.07244   |
| Banana2    | banana: 0.96961   | banana: 0.31156    |
| Bicycle1    | tandem bicycle: 0.57439    | **Not Detected**    |
| Bicycle2    | mountain bike: 0.75485   | tandem bicycle: 0.02420    |
| Bird1    | hen: 0.13121   | **Not Detected**   |
| Bird2    |common gallinule: 0.04753   | **Not Detected**    |
| Car1    | minibus: 0.28072    | **Not Detected**    |
| Car2    | taxicab: 0.15552   | **Not Detected**    |
| Coffeemug1    | coffee mug: 0.28597    | **Not Detected**    |
| Coffeemug2    | coffee mug: 0.36748  | coffee mug: 0.34415    |
| Dog1    | Beagle: 0.16741    | **Not Detected**    |
| Dog2    | Chihuahua: 0.24932    | **Not Detected**    |
| Fish1    | tench: 0.03026    | **Not Detected**   |
| Fish2    | goldfish: 0.55970   | **Not Detected**    |
| Flower1    | **Not Detected**    | **Not Detected**    |
| Flower2    | daisy: 0.15994    | **Not Detected**    |
| Guitar1    | acoustic guitar: 0.91748  | acoustic guitar: 0.99653    |
| Guitar2    | acoustic guitar: 0.76977    | acoustic guitar: 0.94424    |
| Laptop1    |laptop computer: 0.51025    | notebook computer: 0.22355    |
| Laptop2    |laptop computer: 0.50645    | laptop computer: 0.13936    |


(i) Banana and Bicycle are detected better using CLIP but not by ImageNet pretrained RN50.

Reason:
CLIP can understand abstract concepts and relationships between objects depicted in images and text. The cartoon banana might not be recognizable based on visual features alone, but CLIP can leverage textual context to understand its representation.

(ii) Guitar is well detected using ImageNet pretained RN50 than CLIP.


# Part 5

In [1]:
!pip install ftfy


Collecting ftfy
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.2.0


## (i)

In [6]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import time

# Load the original RN50 CLIP image encoder model trained with FP32 precision
model_fp32, preprocess = torch.hub.load('openai/clip', 'RN50')
model_fp32.eval()

# Convert the model's parameters to FP16
model_fp16 = model_fp32.half()

# Define image preprocessing transformations
transform = transforms.Compose([
    transforms.Resize(224),  # Resize image to match model input size
    transforms.CenterCrop(224),  # Crop image to center
    transforms.ToTensor(),  # Convert PIL image to PyTorch tensor
])

# Load and preprocess the image
image_path = '/content/drive/MyDrive/Images/coffeemug2.jpg'
image = Image.open(image_path)
image_tensor = transform(image).unsqueeze(0)  # Add batch dimension

# Move the input tensor to the same device as the model's parameters
device = next(model_fp16.parameters()).device
image_tensor = image_tensor.to(device)

# Measure the time taken to encode an image using the FP16 model
runs = 100
fp16_times = []
for _ in range(runs):
    start_time = time.time()
    with torch.no_grad():
        _ = model_fp16.encode_image(image_tensor)
    end_time = time.time()
    fp16_times.append(end_time - start_time)

# Calculate mean and standard deviation for FP16 model
fp16_mean_time = torch.tensor(fp16_times).mean().item()
fp16_std_time = torch.tensor(fp16_times).std().item()

# Move the input tensor to the same device as the model's parameters for FP32 model
image_tensor = image_tensor.to(next(model_fp32.parameters()).device)

# Measure the time taken to encode an image using the original FP32 model
fp32_times = []
for _ in range(runs):
    start_time = time.time()
    with torch.no_grad():
        _ = model_fp32.encode_image(image_tensor)
    end_time = time.time()
    fp32_times.append(end_time - start_time)

# Calculate mean and standard deviation for FP32 model
fp32_mean_time = torch.tensor(fp32_times).mean().item()
fp32_std_time = torch.tensor(fp32_times).std().item()

# Calculate the difference in mean wall-clock times between the FP32 and FP16 models
time_difference = fp32_mean_time - fp16_mean_time

# Report results
print("FP16 Mean Time: {:.6f} sec (std: {:.6f} sec)".format(fp16_mean_time, fp16_std_time))
print("FP32 Mean Time: {:.6f} sec (std: {:.6f} sec)".format(fp32_mean_time, fp32_std_time))
print("Difference in Mean Time: {:.6f} sec".format(time_difference))


Using cache found in /root/.cache/torch/hub/openai_clip_main


FP16 Mean Time: 0.018815 sec (std: 0.093690 sec)
FP32 Mean Time: 0.011019 sec (std: 0.001439 sec)
Difference in Mean Time: -0.007797 sec


## (ii)

In [13]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import numpy as np

# Load the original RN50 CLIP image encoder model trained with FP32 precision
model_fp32, preprocess = torch.hub.load('openai/clip', 'RN50')
model_fp32.eval()

# Convert the model's parameters to FP16
model_fp16 = model_fp32.half()
model_fp16.to('cuda')  # Move the model to GPU if available

# Define image preprocessing transformations
transform = transforms.Compose([
    transforms.Resize(224),  # Resize image to match model input size
    transforms.CenterCrop(224),  # Crop image to center
    transforms.ToTensor(),  # Convert PIL image to PyTorch tensor
])

# Define classes for the images
classes = ['dog', 'cat', 'car', 'flower', 'bird']

# Load and preprocess the images
image_paths = ['/content/drive/MyDrive/Images/dog1.jpg', '/content/drive/MyDrive/Images/cat.jpeg', '/content/drive/MyDrive/Images/car1.jpg', '/content/drive/MyDrive/Images/flower1.jpg', '/content/drive/MyDrive/Images/bird1.jpg']
images = [Image.open(path) for path in image_paths]
image_tensors = torch.stack([transform(image) for image in images]).to('cuda')  # Move the input tensor to GPU if available

# Calculate probabilities using FP16 model
with torch.no_grad():
    logits_fp16 = model_fp16.encode_image(image_tensors.half())

probabilities_fp16 = torch.nn.functional.softmax(logits_fp16, dim=-1)

# Convert probabilities to numpy arrays for easier comparison
probabilities_fp16 = probabilities_fp16.cpu().numpy()

# Print class labels and corresponding probabilities for FP16 model
for i, class_name in enumerate(classes):
    print(f"Class: {class_name}")
    print(f"FP16 Probabilities: {probabilities_fp16[i]}")
    print()


Using cache found in /root/.cache/torch/hub/openai_clip_main


Class: dog
FP16 Probabilities: [0.000966  0.000996  0.000978  ... 0.0009494 0.000998  0.001164 ]

Class: cat
FP16 Probabilities: [0.0009513 0.001052  0.000946  ... 0.00099   0.000953  0.001201 ]

Class: car
FP16 Probabilities: [0.00093   0.000991  0.0009274 ... 0.000992  0.000992  0.001016 ]

Class: flower
FP16 Probabilities: [0.000952 0.001027 0.000952 ... 0.000962 0.000982 0.001016]

Class: bird
FP16 Probabilities: [0.000981  0.00096   0.000982  ... 0.0009503 0.001046  0.001113 ]



In [14]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import numpy as np

# Load the original RN50 CLIP image encoder model trained with FP32 precision
model_fp32, preprocess = torch.hub.load('openai/clip', 'RN50')
model_fp32.eval()

# Convert the model's parameters to FP16
model_fp16 = model_fp32.half()
model_fp16.to('cuda')  # Move the model to GPU if available

# Define image preprocessing transformations
transform = transforms.Compose([
    transforms.Resize(224),  # Resize image to match model input size
    transforms.CenterCrop(224),  # Crop image to center
    transforms.ToTensor(),  # Convert PIL image to PyTorch tensor
])

# Define classes for the images
classes = ['dog', 'cat', 'car', 'flower', 'bird']

# Load and preprocess the images
image_paths = ['/content/drive/MyDrive/Images/dog1.jpg', '/content/drive/MyDrive/Images/cat.jpeg', '/content/drive/MyDrive/Images/car1.jpg', '/content/drive/MyDrive/Images/flower1.jpg', '/content/drive/MyDrive/Images/bird1.jpg']
images = [Image.open(path) for path in image_paths]
image_tensors = torch.stack([transform(image) for image in images]).to('cuda')  # Move the input tensor to GPU if available

# Calculate probabilities using FP32 model
with torch.no_grad():
    logits_fp32 = model_fp32.encode_image(image_tensors)

probabilities_fp32 = torch.nn.functional.softmax(logits_fp32, dim=-1)

# Convert probabilities to numpy arrays for easier comparison
probabilities_fp32 = probabilities_fp32.cpu().numpy()

# Print class labels and corresponding probabilities for FP32 model
for i, class_name in enumerate(classes):
    print(f"Class: {class_name}")
    print(f"FP32 Probabilities: {probabilities_fp32[i]}")
    print()



Using cache found in /root/.cache/torch/hub/openai_clip_main


Class: dog
FP32 Probabilities: [0.000966  0.000996  0.000978  ... 0.0009494 0.000998  0.001164 ]

Class: cat
FP32 Probabilities: [0.0009513 0.001052  0.000946  ... 0.00099   0.000953  0.001201 ]

Class: car
FP32 Probabilities: [0.00093   0.000991  0.0009274 ... 0.000992  0.000992  0.001016 ]

Class: flower
FP32 Probabilities: [0.000952 0.001027 0.000952 ... 0.000962 0.000982 0.001016]

Class: bird
FP32 Probabilities: [0.000981  0.00096   0.000982  ... 0.0009503 0.001046  0.001113 ]



- There is no much difference between FP32 and FP16
- It seems that there are no significant differences between the probabilities calculated using the FP32 and FP16 models. The maximum absolute difference across all classes is 0.0, indicating that the probabilities are exactly the same for both models.
- This result suggests that converting the model's parameters to FP16 did not introduce any noticeable differences in the output probabilities. However, this may not always be the case with other models or datasets. In some cases, using FP16 may lead to slight differences in the output due to the reduced precision, but in this particular scenario, the differences are negligible.






## (iii)

In [16]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
import subprocess

# Function to run nvidia-smi command and parse memory usage
def get_gpu_memory_usage():
    result = subprocess.run(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'], stdout=subprocess.PIPE)
    memory_usage = int(result.stdout.decode().strip())
    return memory_usage

# Load the original RN50 CLIP image encoder model trained with FP32 precision
model_fp32, preprocess = torch.hub.load('openai/clip', 'RN50')
model_fp32.eval()
model_fp32.to('cuda')  # Move the model to GPU if available

# Convert the model's parameters to FP16
model_fp16 = model_fp32.half()

# Define image preprocessing transformations
transform = transforms.Compose([
    transforms.Resize(224),  # Resize image to match model input size
    transforms.CenterCrop(224),  # Crop image to center
    transforms.ToTensor(),  # Convert PIL image to PyTorch tensor
])

# Define classes for the images
classes = ['dog', 'cat', 'car', 'flower', 'bird']

# Load and preprocess the images
image_paths = ['/content/drive/MyDrive/Images/dog1.jpg', '/content/drive/MyDrive/Images/cat.jpeg', '/content/drive/MyDrive/Images/car1.jpg', '/content/drive/MyDrive/Images/flower1.jpg', '/content/drive/MyDrive/Images/bird1.jpg']
images = [Image.open(path) for path in image_paths]
image_tensors = torch.stack([transform(image) for image in images]).to('cuda')  # Move the input tensor to GPU if available

# Run forward pass with FP32 model and monitor memory usage
print("Memory usage before FP32 forward pass:", get_gpu_memory_usage(), "MiB")
with torch.no_grad():
    logits_fp32 = model_fp32.encode_image(image_tensors)
print("Memory usage after FP32 forward pass:", get_gpu_memory_usage(), "MiB")

# Run forward pass with FP16 model and monitor memory usage
print("Memory usage before FP16 forward pass:", get_gpu_memory_usage(), "MiB")
with torch.no_grad():
    logits_fp16 = model_fp16.encode_image(image_tensors.half())
print("Memory usage after FP16 forward pass:", get_gpu_memory_usage(), "MiB")


Using cache found in /root/.cache/torch/hub/openai_clip_main


Memory usage before FP32 forward pass: 1247 MiB
Memory usage after FP32 forward pass: 1247 MiB
Memory usage before FP16 forward pass: 1247 MiB
Memory usage after FP16 forward pass: 1247 MiB
