# Task-1

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from einops import rearrange
class SwinEmbedding(nn.Module):
    def __init__(self, patch_size=4, C=96):
        super().__init__()
        self.linear_embedding = nn.Conv2d(3, C, kernel_size=patch_size, stride=patch_size)
        self.layer_norm = nn.LayerNorm(C)
        self.relu = nn.ReLU()

    def forward(self,x):
        x = self.linear_embedding(x)
        x = rearrange(x, 'b c h w -> b (h w) c')
        x = self.relu(self.layer_norm(x))
        return x

In [4]:
swinEmbeddings = SwinEmbedding()
a = torch.rand((32,3,512,512))
print(a.shape)
print(swinEmbeddings(a).shape)

torch.Size([32, 3, 512, 512])
torch.Size([32, 16384, 96])


# Task-2

In [8]:
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel
class CLIPPredictor(nn.Module):
    def __init__(self):
        super().__init__()
        self.Model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.Processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.TextInputs = self.Processor(text=["we see a cat", "we see a dog"], return_tensors="pt", padding=True)
        with torch.no_grad():
            self.TextEmbeddings = self.Model.get_text_features(**self.TextInputs)

    def forward(self, URL):
        Picture = Image.open(requests.get(URL, stream=True).raw)
        ImageInputs = self.Processor(images=Picture, return_tensors="pt", padding=True)
        ImageEmbeddings = self.Model.get_image_features(**ImageInputs)
        LogitsPerImage = ImageEmbeddings @ self.TextEmbeddings.T
        LogitsPerImage = LogitsPerImage.log_softmax(dim=-1)

        Probabilities = LogitsPerImage.exp()

        return Probabilities

In [9]:
clipPred = CLIPPredictor()
url = "http://images.cocodataset.org/val2017/img.jpg"
probs = clipPred(url)
probs

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

tensor([[0.9603, 0.0397]], grad_fn=<ExpBackward0>)