In [None]:
    import os
    import requests
    from PIL import Image
    import torch
    import clip
    from io import BytesIO
    import torch
    import torchvision.models as models
    import torchvision.transforms as transforms
    from torchvision.models import ResNet50_Weights
    from torchvision import datasets, transforms
    import random
    import matplotlib.pyplot as plt    
    import numpy as np
    from matplotlib import image as mpimg
    import torch.nn.functional as F
    import time
    from tabulate import tabulate
    import subprocess
    import pandas as pd

In [2]:
"""
I downloaded the synset_words.txt from https://github.com/torch/tutorials/blob/master/7_imagenet_classification/synset_words.txt

[PROMPT] : Write a helper function to read this text file and print both the synset and its group's name
"""

def load_synset_words(synset_word_location):
    """
    Parse a synset listing file into a dictionary.

    Every non-blank line is expected to look like ``<synset_id> <names>``.
    The line is split on its first space only, so the names part may itself
    contain spaces and commas. A line without any space is reported on stdout
    and skipped.

    Returns a dict mapping each synset ID to the remainder of its line.
    """
    mapping = {}

    with open(synset_word_location, "r") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry == "":
                continue

            synset_id, separator, description = entry.partition(" ")

            # An empty separator means the line held a single token only.
            if separator:
                mapping[synset_id] = description
            else:
                print(f"Skipping invalid line: {entry}")

    return mapping

In [3]:
synset_word_location = "../../data/interim/3/synset_words.txt"
synset_words = load_synset_words(synset_word_location)

# Preview the first six entries: the loop stops once the running count
# exceeds 5.
count = 0

for count, (synset_id, name) in enumerate(synset_words.items(), start=1):
    print(f"Synset ID: {synset_id}, Name: {name}")

    if count > 5:
        break


FileNotFoundError: [Errno 2] No such file or directory: '../../data/interim/3/synset_words.txt'

In [None]:
# Number of synset entries parsed from the synset words file.
print(f"Number of labels : {len(synset_words)}")

`1. Inference using CLIP.`

In [None]:
def load_resnet_50_weight():
    """
    Build a torchvision ResNet-50 initialised with the IMAGENET1K_V1 weights
    and return it after switching it to evaluation mode.

    Ref : https://pytorch.org/vision/main/models/generated/torchvision.models.resnet50.html
    """
    pretrained_weights = ResNet50_Weights.IMAGENET1K_V1
    network = models.resnet50(weights=pretrained_weights)

    # Evaluation mode: this notebook only runs inference with the model.
    network.eval()
    return network

In [None]:
# Instantiate the ImageNet-pretrained ResNet-50 (returned in eval mode).
resnet50_imagenet = load_resnet_50_weight()

In [None]:
# Show the model identifiers reported by clip.available_models().
print(f"Available models in CLIP are : {clip.available_models()}")

In [None]:
def load_clip_model(model):
    """
    Load a CLIP checkpoint by name (for example "RN50").

    The model is placed on the GPU when CUDA is available and on the CPU
    otherwise. Returns the ``(model, preprocess)`` pair produced by
    ``clip.load``.

    # Ref => https://github.com/openai/CLIP?tab=readme-ov-file
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    clip_model, image_preprocess = clip.load(model, target_device)
    return clip_model, image_preprocess

In [None]:
# Load the CLIP "RN50" checkpoint together with its image preprocessing callable.
clip_RN50 , preprocess = load_clip_model(model="RN50")


### Difference in the Visual Encoder

![image.png](../../data/interim/3/markdown_space.png)


CLIP has two encoders: one for the visual (either ViT or ResNet) and one for the text (using Transformers).

The differences in the encoder architectures between CLIP and ResNet are as follows:

1. **Input Resolution and Normalization**: 
  - Standard ImageNet ResNet-50 expects 224×224 images with specific normalization values.
  - CLIP's vision encoder is designed to handle multiple input resolutions and uses different normalization constants.

2. **Attention Pooling**: 
  - CLIP replaces the standard global average pooling used in ResNet-50 with an attention pooling (where we basically weight the features and then pool them) mechanism.
  - This allows the model to focus on more relevant parts of the image when creating the final representation.

3. **Modified Final Layer**: 
  - In standard ResNet-50, the final layer is a linear classifier that outputs logits for 1000 ImageNet classes.
  - CLIP's vision encoder, on the other hand, outputs an embedding vector (1024 dimensions for the RN50 variant) that lives in the same space as the text embeddings; after L2-normalizing both, their dot product is the cosine similarity used for zero-shot classification.


`2. Setup data`

`3. Setup zero-shot CLIP`

`3.1 Testing CLIP on imagenet dataset`

In [None]:
# Synset IDs are the dictionary keys; iterating the dict yields them in
# insertion (file) order.
labels = list(synset_words)

print(labels[:5])

In [None]:
def classify_with_clip(model, preprocess, device, image_path, class_labels, top_k=5):
    """
    Zero-shot classify one image with CLIP and return the best-scoring labels.

    Adapted from https://github.com/openai/CLIP?tab=readme-ov-file

    Args:
        model: CLIP model exposing ``encode_image`` and ``encode_text``.
        preprocess: image transform returned together with the model.
        device: torch device (string) the inputs are moved to.
        image_path: path of the image file to classify.
        class_labels: sequence of label strings; each one is turned into the
            prompt ``"a photo of a {label}"``.
        top_k: number of predictions to return (default 5). It is capped at
            ``len(class_labels)`` so short label lists do not make ``topk``
            fail.

    Returns:
        A list of ``(label, probability)`` tuples ordered from most to least
        likely.

    Raises:
        ValueError: if ``class_labels`` is empty.
    """
    if len(class_labels) == 0:
        raise ValueError("class_labels must not be empty")

    image = Image.open(image_path).convert('RGB')
    image_input = preprocess(image).unsqueeze(0).to(device)

    # clip.tokenize accepts a list of strings and returns one row per prompt.
    prompts = [f"a photo of a {label}" for label in class_labels]
    text_inputs = clip.tokenize(prompts).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image_input)
        text_features = model.encode_text(text_inputs)

        # Unit-normalise both sides so the matrix product is a cosine similarity.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1).squeeze(0)

    k = min(top_k, len(class_labels))
    values, indices = similarity.topk(k)

    return [
        (class_labels[class_idx], score)
        for class_idx, score in zip(indices.tolist(), values.tolist())
    ]

In [None]:
"""
I downloaded the mini imagenet dataset from https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000/discussion/284032
"""

# Use the GPU when available. This notebook-level `device` is also read
# directly by functions defined further down.
device = "cuda" if torch.cuda.is_available() else "cpu"

train_dir = "../../data/external/3/imagenet-mini/train"

# Only the `.classes` attribute of this dataset is read in this cell.
imagenet_data = datasets.ImageFolder(root=train_dir, transform=preprocess)

synset_ids = imagenet_data.classes  # these are like n017839 .. and not human readable -> so convert to human labels 

In [None]:

# Build one display name per synset ID, index-aligned with `synset_ids`.
# Later cells index this list by class position and zip it with `synset_ids`,
# so an unknown ID must not be dropped silently: fall back to the raw synset
# ID so both lists keep the same length and order.
human_readable_labels = []

for synset_id in synset_ids:
    if synset_id in synset_words:
        # Keep only the first of the comma-separated names.
        human_readable_labels.append(synset_words[synset_id].split(",")[0])
    else:
        print(f"No name found for synset {synset_id}; using the ID as its label")
        human_readable_labels.append(synset_id)

assert len(human_readable_labels) == len(synset_ids)


In [None]:
# Sanity check: show the first few display names and how many were built.
print(human_readable_labels[:5])

print(f"\nNumber of labels = {len(human_readable_labels)}")

In [None]:
# Qualitative check of zero-shot CLIP: for each of the first NUMBER_IMAGES
# classes, pick one random training image, classify it and show the result.
NUMBER_IMAGES = 5 

for idx in range(NUMBER_IMAGES):

    synset_id = synset_ids[idx]  
    synset_dir = os.path.join(train_dir, synset_id)  
    
    image_files = [f for f in os.listdir(synset_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    
    # Unseeded random choice: a different image may be picked on every run.
    random_image_file = random.choice(image_files)
    image_path = os.path.join(synset_dir, random_image_file)

    # Relies on human_readable_labels being index-aligned with synset_ids.
    true_label = human_readable_labels[idx]
    
    predictions = classify_with_clip(clip_RN50, preprocess, device, image_path, human_readable_labels)

    print(f"\nTrue label: {true_label} (synset ID: {synset_id})")

    img = mpimg.imread(image_path)
    plt.figure(figsize=(6,6))
    plt.imshow(img)
    plt.axis('off') 
    plt.title(f"True: {true_label}\nTop-5 Predictions")
    plt.show()

    # The predictions themselves are printed below the figure.
    for label, score in predictions:
        print(f"Label: {label}, Score: {score:.4f}")


`3.2 Testing ResNet50 pretrained on imagenet dataset`

In [None]:
"""
This is a standard pre-processing done for imagenet dataset and ref = https://stackoverflow.com/questions/67185623/image-net-preprocessing-using-torch-transforms
"""

# Preprocessing pipeline applied to every image before it is fed to ResNet-50.
resnet_preprocess = transforms.Compose([

    # Resize, then take the central 224x224 crop.
    transforms.Resize(256),
    transforms.CenterCrop(224),
    # Convert the PIL image to a tensor, then normalise each channel with the
    # given per-channel mean and standard deviation.
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),

])


In [None]:
def classify_with_resnet50(model, image_path, imagenet_labels , top_k=5):
    """
    Run one image through a ResNet-50 classifier and report its best guesses.

    The image is opened as RGB, passed through the notebook-level
    ``resnet_preprocess`` transform and moved to the notebook-level
    ``device``. Returns a list of ``(label, probability)`` tuples for the
    ``top_k`` highest softmax probabilities, where ``label`` is
    ``imagenet_labels[class_index]``.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    batch = resnet_preprocess(rgb_image).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(batch)

    # Softmax over the class scores of the single image in the batch.
    class_probs = torch.nn.functional.softmax(logits[0], dim=0)
    top_probs, top_classes = torch.topk(class_probs, top_k)

    return [
        (imagenet_labels[class_idx], prob)
        for class_idx, prob in zip(top_classes.tolist(), top_probs.tolist())
    ]

In [None]:
# Move the classifier to the device its inputs are sent to.
resnet50_imagenet = resnet50_imagenet.to(device)

# NOTE(review): classify_with_resnet50 looks up class index i as
# imagenet_labels[i]; this assumes the folder-derived label order matches the
# model's output order — confirm.
imagenet_labels = human_readable_labels

In [None]:

# Qualitative check of ResNet-50: for each of the first NUMBER_IMAGES classes,
# pick one random training image, classify it and show the result.
NUMBER_IMAGES = 5

for idx in range(NUMBER_IMAGES):

    synset_id = synset_ids[idx]
    synset_dir = os.path.join(train_dir, synset_id)

    image_files = [f for f in os.listdir(synset_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

    # Unseeded random choice: a different image may be picked on every run.
    random_image_file = random.choice(image_files)
    image_path = os.path.join(synset_dir, random_image_file)
    
    # Relies on human_readable_labels being index-aligned with synset_ids.
    true_label = human_readable_labels[idx]
    
    resnet_predictions = classify_with_resnet50(resnet50_imagenet, image_path, imagenet_labels)

    print(f"\nClassifying with ResNet50")
    print(f"True label: {true_label} (synset ID: {synset_id})")
    
    img = mpimg.imread(image_path)
    plt.figure(figsize=(6,6))
    plt.imshow(img)
    plt.axis('off')  
    plt.title(f"True: {true_label}\nResNet50 Top-5 Predictions")
    plt.show()
    
    # The predictions themselves are printed below the figure.
    for label, score in resnet_predictions:
        print(f"Label: {label}, Score: {score:.4f}")

`4. CLIP vs ImageNet pretraining.`

In [None]:
def get_predictions(resnet50_model, clip_model, preprocess , image_path, imagenet_labels, device):
    """
    Classify one image with both models against the same label list.

    Returns ``(resnet_prediction, clip_prediction)``: the lists of
    ``(label, score)`` tuples produced by ``classify_with_resnet50`` and
    ``classify_with_clip`` respectively. ResNet-50 is evaluated first.
    """
    return (
        classify_with_resnet50(resnet50_model, image_path, imagenet_labels),
        classify_with_clip(clip_model, preprocess, device, image_path, imagenet_labels),
    )


In [None]:
"""
Class 1 : Fruits
"""

folder_path = "../../data/interim/3/clip-yes-resnet-no/1"

for filename in os.listdir(folder_path):
    # Anything that is not a PNG/JPEG file is ignored.
    if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    image_path = os.path.join(folder_path, filename)

    # Show the image first, then what each model predicts for it.
    img = Image.open(image_path).convert("RGB")
    plt.imshow(img)
    plt.title(f"Image: {filename}")
    plt.axis('off')
    plt.show()

    resnet_prediction, clip_prediction = get_predictions(
        resnet50_imagenet, clip_RN50, preprocess, image_path, imagenet_labels, device
    )

    print(f"== Predictions for: {filename} ==")

    for label, score in resnet_prediction:
        print(f"ResNet Label: {label}, Score: {score:.4f}")

    print()

    for label, score in clip_prediction:
        print(f"CLIP Label: {label}, Score: {score:.4f}")

    print("\n" + "="*100 + "\n")


In [None]:
# Map each synset ID to its display name by pairing the two lists position by
# position (zip stops at the shorter list).
synset_to_label = dict(zip(synset_ids, human_readable_labels))

In [None]:
import os
from PIL import Image
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm

# Paths of validation images on which exactly one of the two models has the
# true label among its top-5 predictions.
in_resnet_not_clip = []
not_resnet_in_clip = []

# Root directory with synset-style folders
root_folder = "../../data/external/3/imagenet-mini/val"

# Get list of folders first
synset_folders = [f for f in os.listdir(root_folder) if os.path.isdir(os.path.join(root_folder, f))]

for synset_id in tqdm(synset_folders, desc="Processing synsets"):

    class_path = os.path.join(root_folder, synset_id)

    # Get the human-readable label
    true_label = synset_to_label.get(synset_id, "").lower()

    if not true_label:
        continue  # Skip if mapping not found

    image_files = [f for f in os.listdir(class_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

    # tqdm's `postfix` argument takes static stats, not a callback, so the
    # running counters are pushed explicitly through set_postfix() below.
    image_bar = tqdm(image_files, leave=False, desc=f"{synset_id}")

    for filename in image_bar:
        image_path = os.path.join(class_path, filename)

        # Run predictions
        resnet_pred, clip_pred = get_predictions(
            resnet50_imagenet, clip_RN50, preprocess, image_path, imagenet_labels, device
        )

        # Get just the labels
        resnet_labels = [label.lower() for label, _ in resnet_pred]
        clip_labels = [label.lower() for label, _ in clip_pred]

        # Exact comparison: a substring test would count e.g. "tiger shark"
        # as a hit for the true label "tiger".
        in_resnet = true_label in resnet_labels
        in_clip = true_label in clip_labels

        if in_resnet and not in_clip:
            in_resnet_not_clip.append(image_path)
        elif in_clip and not in_resnet:
            not_resnet_in_clip.append(image_path)

        # Shown on the progress bar instead of printing one line per hit.
        image_bar.set_postfix(
            in_resnet_not_clip=len(in_resnet_not_clip),
            not_resnet_in_clip=len(not_resnet_in_clip),
            refresh=False,
        )


# Save results to file
with open("in_resnet_not_clip.txt", "w") as f:
    for path in in_resnet_not_clip:
        f.write(f"{path}\n")

with open("not_resnet_in_clip.txt", "w") as f:
    for path in not_resnet_in_clip:
        f.write(f"{path}\n")

print(f"✅ Done! {len(in_resnet_not_clip)} in ResNet but not CLIP")
print(f"✅ Done! {len(not_resnet_in_clip)} in CLIP but not ResNet")


In [None]:
# Deliberately raises so that execution halts at this cell.
# NOTE(review): presumably a "Run All" stop marker guarding the cells below — confirm.
raise KeyboardInterrupt


In [None]:
import os
from PIL import Image
import matplotlib.pyplot as plt

# Paths of training images on which exactly one of the two models has the
# true label among its top-5 predictions.
in_resnet_not_clip = []
not_resnet_in_clip = []

root_folder = "../../data/external/3/imagenet-mini/train/"

for class_name in os.listdir(root_folder):

    class_path = os.path.join(root_folder, class_name)
    if not os.path.isdir(class_path):
        continue

    # Folder names are synset IDs whereas the predictions carry display
    # names, so translate the folder name before comparing.
    true_label = synset_to_label.get(class_name, "").lower()
    if not true_label:
        continue  # No display name known for this folder

    for filename in os.listdir(class_path):

        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(class_path, filename)

        # get_predictions() has no top-k argument; both classifiers return
        # their top-5 by default.
        resnet_pred, clip_pred = get_predictions(
            resnet50_imagenet, clip_RN50, preprocess, image_path, imagenet_labels, device
        )

        # Extract just the labels
        resnet_labels = [label.lower() for label, _ in resnet_pred]
        clip_labels = [label.lower() for label, _ in clip_pred]

        # Exact comparison: a substring test would count e.g. "tiger shark"
        # as a hit for the true label "tiger".
        in_resnet = true_label in resnet_labels
        in_clip = true_label in clip_labels

        if in_resnet and not in_clip:
            in_resnet_not_clip.append(image_path)
        elif in_clip and not in_resnet:
            not_resnet_in_clip.append(image_path)


In [None]:
"""
Class 2
"""

folder_path = "../../data/interim/3/clip-yes-resnet-no/2"

for filename in os.listdir(folder_path):
    # Anything that is not a PNG/JPEG file is ignored.
    if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    image_path = os.path.join(folder_path, filename)

    # Show the image first, then what each model predicts for it.
    img = Image.open(image_path).convert("RGB")
    plt.imshow(img)
    plt.title(f"Image: {filename}")
    plt.axis('off')
    plt.show()

    resnet_prediction, clip_prediction = get_predictions(
        resnet50_imagenet, clip_RN50, preprocess, image_path, imagenet_labels, device
    )

    print(f"== Predictions for: {filename} ==")

    for label, score in resnet_prediction:
        print(f"ResNet Label: {label}, Score: {score:.4f}")

    print()

    for label, score in clip_prediction:
        print(f"CLIP Label: {label}, Score: {score:.4f}")

    print("\n" + "="*100 + "\n")


`5. FP16`

`5.1 Estimating the wall-clock time taken by fp16 and fp32`

In [None]:
"""
To understand what wall - clock time is : https://stackoverflow.com/questions/7335920/what-specifically-are-wall-clock-time-user-cpu-time-and-system-cpu-time-in-uni

Briefly : The wall-clock time is not the number of seconds that the process has spent on the CPU; it is the elapsed time, 
including time spent waiting for its turn on the CPU (while other processes get to run).
"""

def measure_the_wall_clock_inference_time(model, image, RUNS=100, WARMUP=10):
    """
    Measure the wall-clock time of ``model.encode_image(image)``.

    Args:
        model: object exposing ``encode_image``.
        image: input passed unchanged to ``encode_image`` on every call.
        RUNS: number of timed calls (default 100); must be at least 1.
        WARMUP: number of untimed calls made first (default 10) so one-off
            start-up costs do not skew the statistics.

    Returns:
        ``(mean, std)`` of the per-call durations in seconds.

    Raises:
        ValueError: if ``RUNS`` is smaller than 1.
    """
    if RUNS < 1:
        raise ValueError("RUNS must be at least 1")

    # Synchronising is only meaningful (and only possible) when CUDA exists.
    cuda_available = torch.cuda.is_available()

    def _wait_for_gpu():
        if cuda_available:
            torch.cuda.synchronize()

    times = []
    with torch.no_grad():

        for _ in range(WARMUP):
            model.encode_image(image)

        for _ in range(RUNS):

            # Drain pending GPU work so it is not billed to this run.
            _wait_for_gpu()
            start = time.perf_counter()

            model.encode_image(image)
            # Kernels launch asynchronously; wait before stopping the clock.
            _wait_for_gpu()

            times.append(time.perf_counter() - start)

    return np.mean(times), np.std(times)

In [None]:
# clip.load() leaves the weights in half precision when loading onto CUDA (it
# only casts to float32 for CPU), so cast explicitly to obtain a genuine FP32
# baseline on either device.
clip_RN50_fp32 , preprocess = load_clip_model(model="RN50")
clip_RN50_fp32 = clip_RN50_fp32.float()

In [None]:
# A single training image used as the benchmark input.
image_path = "../../data/external/3/imagenet-mini/train/n01443537/n01443537_1298.JPEG"

# Preprocess, add a leading batch dimension and move to the compute device.
source_image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)


# NOTE(review): 1000 runs here versus RUNS=100 for the FP16 measurement —
# confirm the difference is intended.
fp32_mean, fp32_std = measure_the_wall_clock_inference_time(model = clip_RN50_fp32, 
                                                            image=source_image, RUNS=1000)

print(f"FP32: {fp32_mean:.6f}s ± {fp32_std:.6f}s")


In [None]:
"""
To convert the model to half precision : https://discuss.pytorch.org/t/converting-model-into-16-points-precisoin-float16-instead-of-32/102622/12

TLDR : model.half() will transform all parameters and buffers to float16 and NOTE to convert also the image since its an input (also mentioned in the above ref)
"""

# nn.Module.half() converts the module in place and returns the same object,
# so calling it on clip_RN50_fp32 would leave both names bound to one FP16
# model. Load a separate instance for the half-precision variant instead.
clip_rn50_fp16, _ = load_clip_model(model="RN50")
clip_rn50_fp16 = clip_rn50_fp16.half()

# Tensor.half() returns a new tensor, so source_image itself is unchanged.
source_image_fp16 = source_image.half()


In [None]:
# Time the half-precision model on the half-precision copy of the same image.
fp16_mean, fp16_std = measure_the_wall_clock_inference_time(model = clip_rn50_fp16,
                                                             image= source_image_fp16 ,
                                                             RUNS=100)

print(f"FP16: {fp16_mean:.6f}s ± {fp16_std:.6f}s")

In [None]:
# Ratio of mean latencies: values above 1 mean FP16 was faster.
speedup = fp32_mean / fp16_mean

# The first row is the header row (see headers="firstrow" below).
table = [
    ["Precision", "Mean Time (s)", "Std Dev (s)"],
    ["FP32", f"{fp32_mean:.6f}", f"{fp32_std:.6f}"],
    ["FP16", f"{fp16_mean:.6f}", f"{fp16_std:.6f}"]
]

print(tabulate(table, headers="firstrow", tablefmt="grid"))

print(f"\nSpeedup (FP32 / FP16): {speedup:.2f}x")


`5.2 Probabilities computed by fp16 and fp32`

In [None]:
def classify_with_clip_for_prob_check(model, preprocess, image_path, class_labels, use_fp16_for_image=False):
    """
    Score one image against every label with CLIP and return all probabilities.

    The image is opened as RGB, preprocessed and moved to the notebook-level
    ``device``. When ``use_fp16_for_image`` is True the image tensor is cast
    to half precision before it is encoded. The result is a NumPy array with
    one softmax probability per entry of ``class_labels``.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    pixel_batch = preprocess(rgb_image).unsqueeze(0).to(device)

    prompt_tokens = [clip.tokenize(f"a photo of a {label}") for label in class_labels]
    text_batch = torch.cat(prompt_tokens).to(device)

    # Both modes encode the image the same way; FP16 mode only casts the
    # input tensor first.
    if use_fp16_for_image:
        pixel_batch = pixel_batch.half()

    with torch.no_grad():
        image_embedding = model.encode_image(pixel_batch)
        text_embedding = model.encode_text(text_batch)

    # Unit-normalise both sides so the matrix product is a cosine similarity.
    image_embedding /= image_embedding.norm(dim=-1, keepdim=True)
    text_embedding /= text_embedding.norm(dim=-1, keepdim=True)

    scaled_similarity = 100.0 * image_embedding @ text_embedding.T
    probabilities = scaled_similarity.softmax(dim=-1).squeeze(0)

    return probabilities.cpu().numpy()


In [None]:
def compare_classification_outputs(model_fp32, model_fp16, preprocess):
    """
    Compare the class probabilities produced by an FP32 and an FP16 model.

    Five classes are sampled at random from the notebook-level ``synset_ids``
    and one random image is taken from each class folder under ``train_dir``.
    For every image the function prints both top-5 lists, the maximum
    absolute and relative probability difference, and draws the image next to
    a bar chart of the ten classes whose probabilities differ the most.

    Args:
        model_fp32: CLIP model evaluated in float32 (``.float()`` is called on it).
        model_fp16: CLIP model evaluated with a half-precision image input.
        preprocess: image transform passed on to the classification helper.

    Returns:
        A pandas DataFrame with one summary row per image.

    NOTE(review): ``.float()`` is applied to ``model_fp32`` inside the loop; if
    ``model_fp32`` and ``model_fp16`` are the same object this would affect
    ``model_fp16`` as well, so callers should pass two distinct instances.
    """
    
    NUM_IMAGES = 5
    results = []

    # Sample class positions without replacement (unseeded, so runs differ).
    selected_indices = random.sample(range(len(synset_ids)), NUM_IMAGES)

    for idx in selected_indices:
        synset_id = synset_ids[idx]
        synset_dir = os.path.join(train_dir, synset_id)

        image_files = [f for f in os.listdir(synset_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]


        image_file = random.choice(image_files)
        image_path = os.path.join(synset_dir, image_file)
        # Relies on human_readable_labels being index-aligned with synset_ids.
        true_label = human_readable_labels[idx]

        # FP32
        model_fp32 = model_fp32.float()
        probs_fp32 = classify_with_clip_for_prob_check(model_fp32, preprocess, image_path, human_readable_labels)

        # FP16
        probs_fp16 = classify_with_clip_for_prob_check(model_fp16, preprocess, image_path, human_readable_labels, use_fp16_for_image=True)

        # Largest per-class gap, also expressed relative to the top FP32
        # probability (the 1e-10 floor guards against division by zero).
        abs_diff = np.abs(probs_fp32 - probs_fp16).max()
        rel_diff = abs_diff / np.maximum(probs_fp32.max(), 1e-10)

        # Class indices of the five highest probabilities, best first.
        top5_indices_fp32 = np.argsort(probs_fp32)[-5:][::-1]
        top5_indices_fp16 = np.argsort(probs_fp16)[-5:][::-1]

        top1_match = top5_indices_fp32[0] == top5_indices_fp16[0]
        # Number of classes shared by both top-5 sets, regardless of order.
        top5_overlap = len(set(top5_indices_fp32).intersection(set(top5_indices_fp16)))

        results.append({
            'Class': true_label,
            'Image': os.path.basename(image_path),
            'Max Abs Diff': abs_diff,
            'Relative Diff': rel_diff,
            'Top-1 Match': top1_match,
            'Top-5 Overlap': top5_overlap
        })

        # [PROMPT] : Write a helper function to plot the probabilities and the predictions and also tabulate the final results 

        print(f"\n{'='*60}")
        print(f"Image: {os.path.basename(image_path)}")
        print(f"True Class: {true_label}")
        print(f"{'-'*60}")
        print("Top-5 FP32 Predictions:")
        for i in top5_indices_fp32:
            print(f"  {human_readable_labels[i]:<25} {probs_fp32[i]:.6f}")
        print("Top-5 FP16 Predictions:")
        for i in top5_indices_fp16:
            print(f"  {human_readable_labels[i]:<25} {probs_fp16[i]:.6f}")
        print(f"{'-'*60}")
        print(f"Top-1 Match: {top1_match}")
        print(f"Top-5 Overlap: {top5_overlap}/5")
        print(f"Max Abs Difference: {abs_diff:.6f}")
        print(f"Relative Difference: {rel_diff:.6f}")

        # Left panel: the image itself.
        plt.figure(figsize=(12, 8))
        img = plt.imread(image_path)
        plt.subplot(1, 2, 1)
        plt.imshow(img)
        plt.title(f"True class: {true_label}")
        plt.axis('off')

        # Right panel: the ten classes with the largest FP32/FP16 gap
        # (ascending by gap), labels truncated to 20 characters.
        plt.subplot(1, 2, 2)
        sorted_indices = np.argsort(np.abs(probs_fp32 - probs_fp16))[-10:]
        # Local name; it does not rebind the notebook-level `labels` list.
        labels = [human_readable_labels[i][:20] for i in sorted_indices]
        fp32_values = [probs_fp32[i] for i in sorted_indices]
        fp16_values = [probs_fp16[i] for i in sorted_indices]

        x = np.arange(len(labels))
        width = 0.35
        plt.bar(x - width/2, fp32_values, width, label='FP32')
        plt.bar(x + width/2, fp16_values, width, label='FP16')
        plt.xlabel('Class')
        plt.ylabel('Probability')
        plt.title('Top Probability Differences: FP32 vs FP16')
        plt.xticks(x, labels, rotation=45, ha='right')
        plt.legend()
        plt.tight_layout()
        plt.show()

    df = pd.DataFrame(results)

    print("\n\n Summary of Differences Between FP32 and FP16 Outputs:\n")
    print(tabulate(df, headers='keys', tablefmt='grid', showindex=False))

    return df


In [None]:
# results_df = compare_classification_outputs(clip_RN50_fp32, clip_rn50_fp16, preprocess)

#### Why Is There No Significant Difference Between FP32 and FP16 Outputs?

The minimal difference between FP32 and FP16 outputs can be attributed to the following technical reasons:

 1. **Robustness of Neural Networks to Quantization**
- Neural networks, especially large pre-trained models like CLIP or ResNet, are **inherently tolerant to small perturbations** in weights and activations.
- Reducing precision from 32-bit to 16-bit introduces only minor noise, which the model is usually able to **absorb without affecting output rankings**.

2. **Normalization Layers Minimize Precision Sensitivity**
- Models often use **LayerNorm or BatchNorm**, which rescale activations and help **stabilize outputs across different precisions**.
- These layers reduce the risk of large value ranges that could be affected by FP16's lower dynamic range.


3. **Well-Calibrated Probabilities**
- For tasks like image classification, final softmax probabilities tend to **saturate near 1 for the correct class** and decay quickly for others.
- This leads to **large margins** between correct and incorrect predictions, making them **insensitive to small probability changes**.



`5.3 Memory Usage - forward pass `

In [None]:
def get_gpu_memory_usage(gpu_index=0):
    """
    Return the memory currently used on one GPU, as reported by nvidia-smi.

    nvidia-smi prints one line per GPU, so on a multi-GPU machine the raw
    output cannot be converted to a single integer; the line belonging to
    ``gpu_index`` is selected instead.

    Args:
        gpu_index: position of the GPU in nvidia-smi's listing (default 0).

    Returns:
        The ``memory.used`` value for that GPU as an int.

    Raises:
        IndexError: if ``gpu_index`` is outside the reported GPUs.
        subprocess.CalledProcessError: if nvidia-smi exits with an error.
    """
    result = subprocess.check_output(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,nounits,noheader'])

    # One numeric reading per GPU, separated by newlines.
    readings = result.decode().split()
    return int(readings[gpu_index])

In [None]:



def profile_model(model_type, model, preprocess, image_path, num_runs=100):
    """
    Measure GPU memory and latency of repeated ``model.encode_image`` calls.

    Args:
        model_type: label used in the printed report; when it equals "FP16"
            the input image is cast to half precision.
        model: model exposing ``encode_image``; it is passed in already built,
            so the baseline reading below is taken with it in place.
        preprocess: image transform applied to the opened image.
        image_path: path of the image used as input.
        num_runs: number of forward passes to time (default 100).

    Returns:
        A dict with:
            "max_memory_increase": peak nvidia-smi reading minus the baseline,
            "nvidia_smi_increase": peak nvidia-smi reading minus the reading
                taken right after the image was loaded,
            "avg_time" / "std_time": forward-pass time statistics in ms,
            "torch_allocated" / "torch_reserved": PyTorch's own counters in MB.
    """
    
    print(f"\n--- Profiling {model_type} model with {num_runs} runs ---")
    
    # Clear cache before starting
    torch.cuda.empty_cache()
    time.sleep(1)
    
    # Get baseline memory usage
    baseline = get_gpu_memory_usage()
    print(f"Baseline memory usage: {baseline} MB")
    
    # Load image
    image = preprocess(Image.open(image_path)).unsqueeze(0).to("cuda")
    if model_type == "FP16":
        image = image.half()
    
    # Get memory after loading image
    after_image_load = get_gpu_memory_usage()
    print(f"Memory after loading image: {after_image_load} MB (+ {after_image_load - baseline} MB)")
    
    # Perform forward passes
    forward_times = []
    max_memory = after_image_load
    
    for i in range(num_runs):
        with torch.no_grad():
            start_time = time.time()
            # The encoded features are not used; only time and memory matter here.
            image_features = model.encode_image(image)
            torch.cuda.synchronize()  # Make sure GPU operations are completed
            end_time = time.time()
            
        forward_times.append((end_time - start_time) * 1000)  # Convert to ms
        
        # Check memory after each forward pass
        # (sampled outside the timed region, so it does not affect the timings)
        current_memory = get_gpu_memory_usage()
        max_memory = max(max_memory, current_memory)
        
        # Print progress for every 10 runs
        if (i + 1) % 10 == 0:
            print(f"Completed {i + 1}/{num_runs} runs. Current memory: {current_memory} MB")
    
    # Calculate statistics
    avg_time = np.mean(forward_times)
    std_time = np.std(forward_times)
    
    # Get final memory usage
    # (final_memory is only printed; it is not part of the returned dict)
    final_memory = get_gpu_memory_usage()
    memory_increase = max_memory - after_image_load
    
    print(f"Max memory after {num_runs} forward passes: {max_memory} MB (+ {memory_increase} MB from image load)")
    print(f"Final memory after {num_runs} forward passes: {final_memory} MB")
    print(f"Average forward pass time: {avg_time:.2f} ms (std: {std_time:.2f} ms)")
    
    torch_allocated = torch.cuda.memory_allocated() / (1024 * 1024)  # Convert to MB
    torch_reserved = torch.cuda.memory_reserved() / (1024 * 1024)  # Convert to MB
    print(f"PyTorch reported memory - Allocated: {torch_allocated:.2f} MB, Reserved: {torch_reserved:.2f} MB")
    
    return {
        "max_memory_increase": max_memory - baseline,
        "nvidia_smi_increase": memory_increase,
        "avg_time": avg_time,
        "std_time": std_time,
        "torch_allocated": torch_allocated,
        "torch_reserved": torch_reserved
    }

In [None]:


# Main execution
model_name = "RN50"

# clip.load() keeps half-precision weights when loading onto CUDA, so cast to
# float32 explicitly; otherwise the "FP32" profile would measure FP16 weights.
model_fp32, preprocess = clip.load(model_name, device="cuda")
model_fp32 = model_fp32.float()
fp32_results = profile_model("FP32", model_fp32, preprocess, image_path)

# Release the FP32 model before loading the FP16 one so the two runs do not
# share GPU memory.
del model_fp32
torch.cuda.empty_cache()
time.sleep(3)

model_fp16, _ = clip.load(model_name, device="cuda")
model_fp16 = model_fp16.half()
fp16_results = profile_model("FP16", model_fp16, preprocess, image_path)

print("\n" + "="*60)
print("DETAILED COMPARISON BETWEEN FP32 AND FP16")
print("="*60)

# (section title, key in the result dicts, number format for the MB values)
comparison_sections = [
    ("Max Memory Increase (nvidia-smi):", "max_memory_increase", ""),
    ("\nPyTorch Allocated Memory:", "torch_allocated", ".2f"),
    ("\nPyTorch Reserved Memory:", "torch_reserved", ".2f"),
]

for section_title, result_key, number_format in comparison_sections:
    fp32_value = fp32_results[result_key]
    fp16_value = fp16_results[result_key]

    print(section_title)
    print(f"  FP32: {fp32_value:{number_format}} MB")
    print(f"  FP16: {fp16_value:{number_format}} MB")
    print(f"  Savings: {fp32_value - fp16_value:{number_format}} MB")

    # A zero FP32 reading would make the percentage undefined.
    if fp32_value:
        print(f"  Reduction: {(1 - fp16_value / fp32_value) * 100:.2f}%")
    else:
        print("  Reduction: n/a (FP32 value is 0)")


In [None]:
import torch
import torch.profiler
import time
from PIL import Image
import numpy as np

def profile_model_with_pytorch(model_type, model, preprocess, image_path, num_runs=100):
    """
    Profile repeated ``model.encode_image`` calls with ``torch.profiler``.

    CPU and CUDA activity, tensor shapes, Python stacks and memory
    allocations are recorded and written as TensorBoard traces under
    ``./log``. The wall-clock time of every forward pass is printed as well.

    Args:
        model_type: when equal to "FP16" the input image is cast to half
            precision before profiling.
        model: model exposing ``encode_image``.
        preprocess: image transform applied to the opened image.
        image_path: path of the image used as input.
        num_runs: number of forward passes to execute (default 100). With the
            schedule below only a few of them are actually traced: each cycle
            skips 1 step, warms up for 1 and records 3, for 2 cycles.
    """

    # Clear cache before starting
    torch.cuda.empty_cache()
    time.sleep(1)

    # Load image
    image = preprocess(Image.open(image_path)).unsqueeze(0).to("cuda")
    if model_type == "FP16":
        image = image.half()  # Convert to FP16

    # Initialize PyTorch Profiler
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA
        ],
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),
        record_shapes=True,
        # Memory tracking is off by default; without it the trace holds no
        # allocation data even though memory is what is being investigated.
        profile_memory=True,
        with_stack=True
    ) as profiler:
        for _ in range(num_runs):
            with torch.no_grad():
                start_time = time.time()
                model.encode_image(image)
                torch.cuda.synchronize()  # Ensure GPU operations are completed
                end_time = time.time()

                # Tell the profiler that one iteration has finished
                profiler.step()
            forward_time = (end_time - start_time) * 1000  # Convert to milliseconds
            print(f"Forward pass time: {forward_time:.2f} ms")

    # You can visualize the profile using TensorBoard
    # To do this, open the TensorBoard interface with the following:
    # tensorboard --logdir=./log

# Example usage
# The first argument is only a label: it decides whether the input image is
# cast to half precision inside the function, not the precision of the model.
profile_model_with_pytorch("FP32", clip_RN50_fp32, preprocess, image_path)
profile_model_with_pytorch("FP16", clip_rn50_fp16, preprocess, image_path)
