In [1]:
import clip
import wandb
import os
import numpy as np
import torch
import torch.nn as nn
from torch.cuda import amp

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
import wandb

In [4]:
# Attach to a specific Weights & Biases run (project and run id are fixed here).
# NOTE(review): resume='must' presumably makes init fail instead of starting a
# fresh run when that id does not exist — confirm against the wandb docs.
wandb.init(project="clip_cls_36", id="3kuwd2c3", resume='must')
# The run's config object; later cells read hyper-parameters from it by key
# ('clip_type', 'hid_dim', 'dropout', 'activation').
CONFIG = wandb.config

[34m[1mwandb[0m: Currently logged in as: [33mshivamshrirao[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# The cache file name is derived from CONFIG['clip_type'].
# NOTE(review): if clip_type contains a "/", this resolves to a nested path — confirm.
features_cache = f"{CONFIG['clip_type']}_features.pth"
ft_dict = torch.load(features_cache)

# Pull the four cached entries out of the dict in a single unpacking.
train_features, train_labels, test_features, test_labels = (
    ft_dict[split_key]
    for split_key in ("train_features", "train_labels", "test_features", "test_labels")
)

In [6]:
# Distinct training label ids together with how many samples carry each one.
torch.unique(train_labels, return_counts=True)

(tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
        device='cuda:0'),
 tensor([3827, 3798, 3831, 3780, 3758, 3755, 3745, 3763, 3598, 3593, 3585, 3582,
         3594, 3561, 3560, 3543, 3575, 3544, 3542, 3541, 3560, 3558, 3563, 3558,
         3536, 3574, 3532, 3567, 3543, 3528, 3524, 3534, 3566, 3554, 3544, 3568],
        device='cuda:0'))

In [7]:
# Distinct test label ids together with how many samples carry each one.
torch.unique(test_labels, return_counts=True)

(tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
        device='cuda:0'),
 tensor([189, 200, 166, 179, 199, 212, 209, 196, 185, 186, 188, 186, 176, 202,
         199, 210, 169, 203, 189, 199, 181, 181, 174, 179, 190, 170, 211, 173,
         191, 204, 210, 201, 170, 182, 184, 192], device='cuda:0'))

In [8]:
# cnv = {}
# for i in range(8):
#     x = i*45
#     if i%2:
#         cnv[x-5] = cnv[x+5] = i
#     else:
#         cnv[(x-10)%360] = cnv[x] = cnv[(x+10)%360] = i
# cnv = {k//10:v for k,v in cnv.items()}

# for i in range(len(train_labels)):
#     try:
#         train_labels[i] = cnv[train_labels[i].item()]
#     except KeyError:
#         train_labels[i] = 8
# for i in range(len(test_labels)):
#     try:
#         test_labels[i] = cnv[test_labels[i].item()]
#     except KeyError:
#         test_labels[i] = 8

In [9]:
# Width of the classifier's output layer; the label tensors shown above hold ids 0..35.
num_classes = 36

In [10]:
class QuickGELU(nn.Module):
    """Sigmoid-gated activation: ``x * sigmoid(1.702 * x)``."""

    def forward(self, x: torch.Tensor):
        # Gate each element by the sigmoid of its scaled value.
        gate = torch.sigmoid(1.702 * x)
        return x * gate

In [50]:
# Maps an activation name (as stored under CONFIG["activation"]) to the module
# class to instantiate; callers look up the class and then call it.
get_activation = dict(
    q_gelu=QuickGELU,
    relu=nn.ReLU,
    elu=nn.ELU,
    leaky_relu=nn.LeakyReLU,
)

# cls_head = nn.Sequential(
#     # nn.Dropout(CONFIG["dropout"]),
#     nn.Linear(len(train_features[0]), CONFIG["hid_dim"]),
# #     get_activation[CONFIG["activation"]](),
#     get_activation['relu'](),
#     nn.Dropout(CONFIG["dropout"]),
#     nn.Linear(CONFIG["hid_dim"], num_classes)
# )
class Classifier(nn.Module):
    """CLIP visual backbone followed by a small MLP classification head.

    ``forward`` takes a channels-last image batch ``(N, H, W, 3)``; it is
    permuted to ``(N, 3, H, W)`` before normalisation.  Pixel values are
    presumably in the 0-255 range because mean/std are pre-scaled by 255 --
    confirm against the caller.  The result has ``num_classes`` logits per image.
    """

    def __init__(self):
        super().__init__()
        # Per-channel normalisation constants, pre-scaled by 255 and shaped to
        # broadcast over (N, 3, H, W).  They are registered as NON-persistent
        # buffers: they follow the module through .to()/.cuda() but are kept
        # out of the state_dict, so checkpoints saved before still load.
        self.register_buffer(
            "mean",
            255 * torch.tensor([0.485, 0.456, 0.406], dtype=torch.float16, device=device).reshape(1, 3, 1, 1),
            persistent=False,
        )
        self.register_buffer(
            "std",
            255 * torch.tensor([0.229, 0.224, 0.225], dtype=torch.float16, device=device).reshape(1, 3, 1, 1),
            persistent=False,
        )
        # clip.load returns (model, preprocess); the preprocess transform is not used here.
        self.clip_model, _ = clip.load(CONFIG["clip_type"], device)
        # Keep the backbone weights in float32.
        self.clip_model = self.clip_model.float()
        self.cls_head = nn.Sequential(
            # nn.Dropout(CONFIG["dropout"]),
            nn.LazyLinear(CONFIG["hid_dim"]),  # input width is inferred lazily
            get_activation[CONFIG["activation"]](),
            nn.Dropout(CONFIG["dropout"]),
            nn.Linear(CONFIG["hid_dim"], num_classes)
        ).to(device).train()

    def forward(self, x):
        """Normalise ``x``, embed it with the CLIP visual tower and classify it."""
        x = x.permute(0,3,1,2)
        # (x - mean) allocates a new tensor, so the in-place division never
        # touches the caller's input.
        x = (x - self.mean).div_(self.std)
        # mean/std are float16 while the backbone is float32: cast to the
        # backbone dtype so the call also works when autocast is disabled.
        x = self.clip_model.visual(x.to(self.clip_model.dtype))
        x = self.cls_head(x)
        return x
model = Classifier()

In [51]:
# Restore the weights stored in weights/new2.pth into the model built above.
model.load_state_dict(torch.load("weights/new2.pth"))

<All keys matched successfully>

In [32]:
# Take the first 256 cached training feature rows as a sample batch and show its shape.
data = train_features[:256]
print(data.shape)

torch.Size([256, 512])


In [2]:
import cv2

# Example image used for the sanity check and as the tracing input further down.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
im_path = "/home/ubuntu/clip_classifier/data/rbg_test/1_VCC_1034828_24_1613203232.jpg.jpg"
# Read as single-channel grayscale.
im = cv2.imread(im_path, cv2.IMREAD_GRAYSCALE)
# cv2.imread reports failure by returning None rather than raising, which
# would otherwise surface as an obscure error inside cvtColor.
if im is None:
    raise FileNotFoundError(f"Could not read image: {im_path}")
# Replicate the gray channel into three channels.
im = cv2.cvtColor(im, cv2.COLOR_GRAY2RGB)
# Resize to 224x224 and add a leading batch axis -> (1, 224, 224, 3).
im = cv2.resize(im,(224,224))[None]
# Use the notebook-wide device instead of unconditionally calling .cuda().
image = torch.from_numpy(im).to(device)
image.shape

torch.Size([1, 224, 224, 3])

In [None]:
# Display the image tensor itself to eyeball its values.
image

In [53]:
class Wrapped_linear_model(torch.nn.Module):
    """Inference wrapper that converts the wrapped model's outputs into probabilities."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    @torch.inference_mode()
    def forward(self, data, fp16=True):
        """Run ``self.model`` on ``data`` and apply a softmax over dim 1.

        ``fp16`` switches autocast on or off around the forward pass.
        """
        with amp.autocast(enabled=fp16):
            probs = self.model(data).softmax(dim=1)
            # (an argmax over dim 1 with keepdim=True would give hard labels instead)
        return probs
# Wrap the classifier for inference, place it on the target device and switch to eval mode.
wrp_model = Wrapped_linear_model(model).to(device).eval()

In [54]:
%%time
# Synchronize before and after the call so the cell timing includes the GPU work itself.
torch.cuda.synchronize()
with torch.no_grad():
    # Second positional argument is fp16=True (autocast enabled).
    svd_out = wrp_model(image, True)
torch.cuda.synchronize()
print(svd_out.shape, svd_out.dtype)

torch.Size([1, 36]) torch.float32
CPU times: user 9.62 ms, sys: 694 µs, total: 10.3 ms
Wall time: 9.05 ms


In [57]:
# Trace the wrapped classifier using the example image as input, then run
# TorchScript's inference optimization pass on the traced module.
with torch.inference_mode(), torch.jit.optimized_execution(True):
    traced_script_module = torch.jit.trace(wrp_model, image)
    traced_script_module = torch.jit.optimize_for_inference(traced_script_module)

# Print the generated TorchScript source for inspection.
print(traced_script_module.code)

def forward(self,
    data: Tensor) -> Tensor:
  x = torch.permute(data, [0, 3, 1, 2])
  input = torch.div_(torch.sub(x, CONSTANTS.c0), CONSTANTS.c1)
  _0 = torch.conv2d(input, CONSTANTS.c2, None, [32, 32])
  _1 = [torch.size(_0, 0), torch.size(_0, 1), -1]
  x0 = torch.reshape(_0, _1)
  x1 = torch.permute(x0, [0, 2, 1])
  _2 = [torch.size(x1, 0), 1, torch.size(x1, 2)]
  _3 = torch.zeros(_2, dtype=5, layout=None, device=torch.device("cuda:0"), pin_memory=False)
  x2 = torch.cat([torch.add(CONSTANTS.c3, _3), x1], 1)
  x3 = torch.add(x2, CONSTANTS.c4)
  input0 = torch.to(x3, 6)
  ret = torch.layer_norm(input0, [768], CONSTANTS.c5, CONSTANTS.c6)
  x4 = torch.to(ret, 5)
  x5 = torch.permute(x4, [1, 0, 2])
  input1 = torch.to(x5, 6)
  ret0 = torch.layer_norm(input1, [768], CONSTANTS.c7, CONSTANTS.c8)
  query = torch.to(ret0, 5)
  _4 = torch.size(query, 0)
  _5 = torch.size(query, 1)
  bsz = ops.prim.NumToTensor(_5)
  _6 = torch.size(query, 2)
  embed_dim = ops.prim.NumToTensor(_6)
  head_dim

In [58]:
# Export directory for the traced classifier.
OUT_PATH = "car_angle_classifier_36/2/"
os.makedirs(OUT_PATH, exist_ok=True)

# OUT_PATH already ends with "/", so formatting "{OUT_PATH}/model.pt" yields a
# doubled separator; os.path.join builds the path without it.
model_file = os.path.join(OUT_PATH, "model.pt")
traced_script_module.save(model_file)
# Reload from disk so the following cells exercise the serialized artifact.
traced_script_module = torch.jit.load(model_file)

In [61]:
%%time
# Time one forward pass of the reloaded TorchScript module on the example image.
# NOTE(review): the multi-second wall time recorded below is presumably
# first-call JIT warm-up rather than steady-state latency — re-run to confirm.
torch.cuda.synchronize()
with torch.no_grad():
    o = traced_script_module(image)
torch.cuda.synchronize()
print(o.shape, o.dtype)

torch.Size([1, 36]) torch.float32
CPU times: user 10.2 s, sys: 747 µs, total: 10.2 s
Wall time: 10.5 s


# Optional: Trace new backbones

In [45]:
# Load the bare CLIP model together with its preprocessing transform.
# NOTE: this rebinds `model`, which earlier in the notebook referred to the Classifier instance.
model, preprocess = clip.load(CONFIG["clip_type"], device)

In [47]:
class WrappedModel(torch.nn.Module):
    """Wraps a CLIP model so channels-last image batches map directly to image features."""

    def __init__(self,model):
        super().__init__()
        self.model = model
        # Per-channel normalisation statistics, pre-multiplied by 255 and
        # shaped (1, 3, 1, 1) so they broadcast over a channels-first batch.
        channel_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073])
        channel_std = torch.tensor([0.26862954, 0.26130258, 0.27577711])
        self.mean = 255 * channel_mean.to(device).reshape(1,3,1,1)
        self.std = 255 * channel_std.to(device).reshape(1,3,1,1)

    @torch.inference_mode()
    def forward(self, data, fp16=True):
        """Normalise ``data`` and return ``self.model.visual`` features.

        ``data`` is permuted from (N, H, W, C) to (N, C, H, W); ``fp16``
        switches autocast on or off around the computation.
        """
        channels_first = data.permute(0,3,1,2)
        with amp.autocast(enabled=fp16):
            # sub() allocates a new tensor, so the in-place div_ leaves the input untouched.
            normalized = channels_first.sub(self.mean).div_(self.std)
            image_features = self.model.visual(normalized.to(self.model.dtype))
        return image_features

# Wrap the CLIP model for feature extraction, switch to eval mode and move it to the GPU.
# NOTE: this rebinds `wrp_model`, previously the wrapped classifier.
wrp_model = WrappedModel(model).eval().cuda()

In [48]:
# Random uint8 dummy batch (256 images, 224x224, channels last) used as the
# timing/tracing input.  torch.randint's upper bound is exclusive, so 256 is
# needed for the full 0..255 pixel range (255 would never produce the value 255).
# NOTE: this rebinds `data`, which earlier held a slice of cached features.
data = torch.randint(0,256,(256,224,224,3), dtype=torch.uint8, device=device)

In [49]:
%%time
# Time one eager forward pass of the wrapped backbone on the dummy batch;
# synchronize before and after so the measurement includes the GPU work.
torch.cuda.synchronize()
with torch.no_grad():
    svd_out = wrp_model(data)
torch.cuda.synchronize()
print(svd_out.shape, svd_out.dtype)

torch.Size([256, 1024]) torch.float16
CPU times: user 144 ms, sys: 20 µs, total: 144 ms
Wall time: 143 ms


In [37]:
# Trace the wrapped backbone with the dummy batch as example input, then run
# TorchScript's inference optimization pass on the traced module.
with torch.inference_mode(), torch.jit.optimized_execution(True):
    traced_script_module = torch.jit.trace(wrp_model, data)
    traced_script_module = torch.jit.optimize_for_inference(traced_script_module)
# Print the generated TorchScript source for inspection.
print(traced_script_module.code)

def forward(self,
    data: Tensor) -> Tensor:
  data0 = torch.permute(data, [0, 3, 1, 2])
  data1 = torch.sub(data0, CONSTANTS.c0)
  data2 = torch.div_(data1, CONSTANTS.c1)
  input = torch.to(data2, 5)
  _0 = torch.conv2d(input, CONSTANTS.c2, None, [32, 32])
  _1 = [torch.size(_0, 0), torch.size(_0, 1), -1]
  x = torch.reshape(_0, _1)
  x0 = torch.permute(x, [0, 2, 1])
  _2 = [torch.size(x0, 0), 1, torch.size(x0, 2)]
  _3 = torch.zeros(_2, dtype=5, layout=None, device=torch.device("cuda:0"), pin_memory=False)
  x1 = torch.cat([torch.add(CONSTANTS.c3, _3), x0], 1)
  x2 = torch.add(x1, CONSTANTS.c4)
  input0 = torch.to(x2, 6)
  ret = torch.layer_norm(input0, [768], CONSTANTS.c5, CONSTANTS.c6)
  x3 = torch.to(ret, 5)
  x4 = torch.permute(x3, [1, 0, 2])
  input1 = torch.to(x4, 6)
  ret0 = torch.layer_norm(input1, [768], CONSTANTS.c7, CONSTANTS.c8)
  query = torch.to(ret0, 5)
  _4 = torch.size(query, 0)
  _5 = torch.size(query, 1)
  bsz = ops.prim.NumToTensor(_5)
  _6 = torch.size(query, 2

In [38]:
# Export directory named after the backbone type.
# NOTE(review): if CONFIG['clip_type'] contains a "/", makedirs creates nested
# directories here — confirm that layout is intended.
OUT_PATH = f"trace_{CONFIG['clip_type']}"
os.makedirs(OUT_PATH, exist_ok=True)

# Save the traced backbone, then reload it so later cells use the serialized artifact.
traced_script_module.save(f"{OUT_PATH}/model.pt")
traced_script_module = torch.jit.load(f"{OUT_PATH}/model.pt")

In [42]:
%%time
# Time one forward pass of the reloaded TorchScript backbone on the dummy batch.
torch.cuda.synchronize()
with torch.inference_mode():
    o = traced_script_module(data)
torch.cuda.synchronize()
print(o.shape, o.dtype)

torch.Size([256, 512]) torch.float16
CPU times: user 82.4 ms, sys: 0 ns, total: 82.4 ms
Wall time: 81.3 ms


In [40]:
# Check that the traced module's output matches the eager output within tolerance.
# NOTE(review): the shapes printed in the cells above differ (svd_out is
# 1024-wide, o is 512-wide), which suggests those outputs come from different
# kernel states — restart and run all cells before trusting this check.
np.testing.assert_allclose(o.cpu().numpy(), svd_out.cpu().numpy(), rtol=1e-5, atol=1e-3)